import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
land = pd.read_csv("data/CountyData/5296US_landarea.csv")
display(land.info())
display(land.isna().sum())
display(land)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3111 entries, 0 to 3110 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FIPS 3111 non-null int64 1 State 3111 non-null object 2 County 3111 non-null object 3 Land Area 3111 non-null float64 dtypes: float64(1), int64(1), object(2) memory usage: 97.3+ KB
None
FIPS 0 State 0 County 0 Land Area 0 dtype: int64
| FIPS | State | County | Land Area | |
|---|---|---|---|---|
| 0 | 1001 | Alabama | Autauga | 1543.7 |
| 1 | 1003 | Alabama | Baldwin | 4135.0 |
| 2 | 1005 | Alabama | Barbour | 2292.1 |
| 3 | 1007 | Alabama | Bibb | 1611.9 |
| 4 | 1009 | Alabama | Blount | 1672.3 |
| ... | ... | ... | ... | ... |
| 3106 | 56037 | Wyoming | Sweetwater | 27003.0 |
| 3107 | 56039 | Wyoming | Teton | 10380.6 |
| 3108 | 56041 | Wyoming | Uinta | 5391.7 |
| 3109 | 56043 | Wyoming | Washakie | 5802.0 |
| 3110 | 56045 | Wyoming | Weston | 6210.6 |
3111 rows × 4 columns
water = pd.read_csv("data/CountyData/5296US_waterarea.csv")
display(water.info())
display(water.isna().sum())
display(water)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3111 entries, 0 to 3110 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FIPS 3111 non-null int64 1 State 3111 non-null object 2 County 3111 non-null object 3 Water Area 2758 non-null float64 dtypes: float64(1), int64(1), object(2) memory usage: 97.3+ KB
None
FIPS 0 State 0 County 0 Water Area 353 dtype: int64
| FIPS | State | County | Water Area | |
|---|---|---|---|---|
| 0 | 1001 | Alabama | Autauga | 22.0 |
| 1 | 1003 | Alabama | Baldwin | 1115.1 |
| 2 | 1005 | Alabama | Barbour | 50.7 |
| 3 | 1007 | Alabama | Bibb | 8.1 |
| 4 | 1009 | Alabama | Blount | 12.9 |
| ... | ... | ... | ... | ... |
| 3106 | 56037 | Wyoming | Sweetwater | 170.0 |
| 3107 | 56039 | Wyoming | Teton | 554.3 |
| 3108 | 56041 | Wyoming | Uinta | 15.3 |
| 3109 | 56043 | Wyoming | Washakie | 7.0 |
| 3110 | 56045 | Wyoming | Weston | 5.7 |
3111 rows × 4 columns
pop = pd.read_csv("data/CountyData/5296US_pop.csv")
display(pop.info())
display(pop.isna().sum())
display(pop)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3111 entries, 0 to 3110 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FIPS 3111 non-null int64 1 State 3111 non-null object 2 County 3111 non-null object 3 Date 3111 non-null int64 4 Population 3111 non-null int64 dtypes: int64(3), object(2) memory usage: 121.6+ KB
None
FIPS 0 State 0 County 0 Date 0 Population 0 dtype: int64
| FIPS | State | County | Date | Population | |
|---|---|---|---|---|---|
| 0 | 1001 | Alabama | Autauga | 1990 | 34222 |
| 1 | 1003 | Alabama | Baldwin | 1990 | 98280 |
| 2 | 1005 | Alabama | Barbour | 1990 | 25417 |
| 3 | 1007 | Alabama | Bibb | 1990 | 16576 |
| 4 | 1009 | Alabama | Blount | 1990 | 39248 |
| ... | ... | ... | ... | ... | ... |
| 3106 | 56037 | Wyoming | Sweetwater | 1990 | 38823 |
| 3107 | 56039 | Wyoming | Teton | 1990 | 11172 |
| 3108 | 56041 | Wyoming | Uinta | 1990 | 18705 |
| 3109 | 56043 | Wyoming | Washakie | 1990 | 8388 |
| 3110 | 56045 | Wyoming | Weston | 1990 | 6518 |
3111 rows × 5 columns
We notice that the three dataframes have FIPS,State and County columns and the same number of entries. so we can merge them into one dataframe.
But before we merge, we have to impute the missing values founded in water (353 water area). That will be done by:
1- copy the dataframe to cwater.
2- factorize the string columns into numeric values
3- impute the missing values with KNNImputer with n_neighbors = 10, thus to keep as close as possible to the real numbers.
4- replace water area with the imputed one.
cwater = water.copy()
for col in cwater.columns:
if cwater[col].dtype == 'object':
cwater[col] = pd.factorize(cwater[col])[0]
knn = KNNImputer(n_neighbors=10).fit(cwater)
cwater = pd.DataFrame(knn.transform(cwater), columns=water.columns)
display(cwater.isna().sum())
FIPS 0 State 0 County 0 Water Area 0 dtype: int64
water['Water Area'] = cwater['Water Area']
countyData = pd.merge(land, pd.merge(water,pop))
# lower case headers
countyData.columns = countyData.columns.str.lower()
# lower case string values
countyData = countyData.applymap(lambda s: s.lower() if type(s) == str else s)
# check the final output
display(countyData.head(10))
display(countyData.isna().sum())
| fips | state | county | land area | water area | date | population | |
|---|---|---|---|---|---|---|---|
| 0 | 1001 | alabama | autauga | 1543.7 | 22.0 | 1990 | 34222 |
| 1 | 1003 | alabama | baldwin | 4135.0 | 1115.1 | 1990 | 98280 |
| 2 | 1005 | alabama | barbour | 2292.1 | 50.7 | 1990 | 25417 |
| 3 | 1007 | alabama | bibb | 1611.9 | 8.1 | 1990 | 16576 |
| 4 | 1009 | alabama | blount | 1672.3 | 12.9 | 1990 | 39248 |
| 5 | 1011 | alabama | bullock | 1618.9 | 2.7 | 1990 | 11042 |
| 6 | 1013 | alabama | butler | 2012.3 | 2.7 | 1990 | 21892 |
| 7 | 1015 | alabama | calhoun | 1576.0 | 10.0 | 1990 | 116034 |
| 8 | 1017 | alabama | chambers | 1547.3 | 14.9 | 1990 | 36876 |
| 9 | 1019 | alabama | cherokee | 1432.7 | 121.3 | 1990 | 19543 |
fips 0 state 0 county 0 land area 0 water area 0 date 0 population 0 dtype: int64
plt.figure(figsize=(12,12))
sns.heatmap(countyData.corr(), annot=True)
plt.title("CountyData features correlation matrix", fontsize=15, color='red')
plt.xticks(rotation = 90)
plt.show()
plt.figure(figsize=(20,20))
sns.pairplot(countyData, corner=True)
plt.show()
<Figure size 1440x1440 with 0 Axes>
plt.figure(figsize=(15,15))
sns.countplot(data=countyData, x='state')
plt.xticks(rotation = 75)
plt.show()
bcb = pd.read_csv("data/EconomicData/BCB-UDJIAD1.csv")
display(bcb.info())
display(bcb.isna().sum())
display(bcb)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32601 entries, 0 to 32600 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 32601 non-null object 1 Value 32601 non-null float64 dtypes: float64(1), object(1) memory usage: 509.5+ KB
None
Date 0 Value 0 dtype: int64
| Date | Value | |
|---|---|---|
| 0 | 2016-04-15 | 17897.46 |
| 1 | 2016-04-14 | 17926.43 |
| 2 | 2016-04-13 | 17908.28 |
| 3 | 2016-04-12 | 17721.25 |
| 4 | 2016-04-11 | 17556.41 |
| ... | ... | ... |
| 32596 | 1896-07-18 | 31.50 |
| 32597 | 1896-07-17 | 31.94 |
| 32598 | 1896-07-16 | 31.95 |
| 32599 | 1896-07-15 | 32.28 |
| 32600 | 1896-07-14 | 33.43 |
32601 rows × 2 columns
Date column is considered as string, in order to make benefit of it we will cast into datetime and then create three new columns to replace date, which are (day, month, year) all of numeric types. to keep the dataframe as close as possible we will rearrange the columns.
# lower case headers
bcb.columns = bcb.columns.str.lower()
bcb['date'] = pd.to_datetime(bcb['date'])
bcb['day'] = pd.to_numeric(bcb.date.dt.strftime('%d'))
bcb['month'] = pd.to_numeric(bcb.date.dt.strftime('%m'))
bcb['year'] = pd.to_numeric(bcb.date.dt.strftime('%Y'))
bcb = bcb[['day', 'month', 'year', 'value']]
display(bcb)
| day | month | year | value | |
|---|---|---|---|---|
| 0 | 15 | 4 | 2016 | 17897.46 |
| 1 | 14 | 4 | 2016 | 17926.43 |
| 2 | 13 | 4 | 2016 | 17908.28 |
| 3 | 12 | 4 | 2016 | 17721.25 |
| 4 | 11 | 4 | 2016 | 17556.41 |
| ... | ... | ... | ... | ... |
| 32596 | 18 | 7 | 1896 | 31.50 |
| 32597 | 17 | 7 | 1896 | 31.94 |
| 32598 | 16 | 7 | 1896 | 31.95 |
| 32599 | 15 | 7 | 1896 | 32.28 |
| 32600 | 14 | 7 | 1896 | 33.43 |
32601 rows × 4 columns
plt.figure(figsize=(10,10))
sns.heatmap(bcb.corr(), annot=True)
plt.title("BCB features correlation matrix", fontsize=15, color='red')
plt.xticks(rotation = 90)
plt.show()
We noticed a fairly high correlation between year and value, so we will visualize this further
plt.figure(figsize=(12,12))
sns.scatterplot(data=bcb, x='year', y='value', hue='month')
plt.title("Year Vs. Value, by Month", fontsize=15, color='red')
plt.xticks(rotation = 75)
plt.show()
county1 = pd.read_csv("data/EconomicData/county1.csv")
display(county1.info())
display(county1.isna().sum())
display(county1)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3272 entries, 0 to 3271 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Table C1. Median Household Income by County: 1969, 1979, 1989, and 1999 3209 non-null object 1 Unnamed: 1 3231 non-null object 2 Unnamed: 2 3193 non-null object 3 Unnamed: 3 3193 non-null object 4 Unnamed: 4 3186 non-null object 5 Unnamed: 5 1 non-null object dtypes: object(6) memory usage: 153.5+ KB
None
Table C1. Median Household Income by County: 1969, 1979, 1989, and 1999 63 Unnamed: 1 41 Unnamed: 2 79 Unnamed: 3 79 Unnamed: 4 86 Unnamed: 5 3271 dtype: int64
| Table C1. Median Household Income by County: 1969, 1979, 1989, and 1999 | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | |
|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | (In current dollars) | NaN | NaN | NaN | NaN | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | NaN | 1999 | 1989 | 1979 | 1969 | NaN |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 3267 | (301) 457-3242 | NaN | NaN | NaN | NaN | NaN |
| 3268 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3269 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3270 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3271 | NaN | --------- | NaN | NaN | NaN | NaN |
3272 rows × 6 columns
We notice that:
1- the dataframe has multiple line header.
2- has extra rows in the end.
3- has extra column.
so we have to fix the issues.
county1 = county1.loc[5:3258]
county1.drop(columns=['Unnamed: 5'], inplace=True)
county1.columns = ['median household by county', 'mhs1999', 'mhs1989', 'mhs1979', 'mhs1969']
display(county1.info())
display(county1.isna().sum())
display(county1)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3254 entries, 5 to 3258 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 median household by county 3200 non-null object 1 mhs1999 3228 non-null object 2 mhs1989 3191 non-null object 3 mhs1979 3191 non-null object 4 mhs1969 3184 non-null object dtypes: object(5) memory usage: 127.2+ KB
None
median household by county 54 mhs1999 26 mhs1989 63 mhs1979 63 mhs1969 70 dtype: int64
| median household by county | mhs1999 | mhs1989 | mhs1979 | mhs1969 | |
|---|---|---|---|---|---|
| 5 | United States | 41,994 | 30,056 | 16,841 | 8,486 |
| 6 | NaN | NaN | NaN | NaN | NaN |
| 7 | Alabama | 34,135 | 23,597 | 13,669 | 6,419 |
| 8 | Autauga County, AL | 42,013 | 28,337 | 16,524 | 6,808 |
| 9 | Baldwin County, AL | 40,250 | 25,712 | 14,614 | 6,430 |
| ... | ... | ... | ... | ... | ... |
| 3254 | Sweetwater County, WY | 46,537 | 36,210 | 24,114 | 8,284 |
| 3255 | Teton County, WY | 54,614 | 31,586 | 18,442 | 8,410 |
| 3256 | Uinta County, WY | 44,544 | 33,259 | 22,584 | 8,409 |
| 3257 | Washakie County, WY | 34,943 | 25,172 | 18,239 | 7,588 |
| 3258 | Weston County, WY | 32,348 | 26,213 | 20,021 | 7,796 |
3254 rows × 5 columns
All mhs columns should be of a numeric type, but instead got object, that is caused by missing values. let's explore that missing values and replace them with 0.
for col in county1.columns:
print(col)
print(county1[col].unique())
print('---------------------')
median household by county [' United States' nan ' Alabama' ... 'Uinta County, WY' 'Washakie County, WY' 'Weston County, WY'] --------------------- mhs1999 ['41,994' nan '34,135' ... '44,544' '34,943' '32,348'] --------------------- mhs1989 ['30,056' nan '23,597' ... '33,259' '25,172' '26,213'] --------------------- mhs1979 ['16,841' nan '13,669' ... '22,584' '18,239' '20,021'] --------------------- mhs1969 ['8,486' nan '6,419' ... '8,410' '7,588' '7,796'] ---------------------
county1.replace(' ',0, inplace=True)
county1.replace(np.nan,0, inplace=True)
# check if we still have missing values
display(county1.isna().sum())
median household by county 0 mhs1999 0 mhs1989 0 mhs1979 0 mhs1969 0 dtype: int64
Now the numers are typed as string with comma separation as thousands. we have to drop that commas
for col in county1.columns:
if col == 'median household by county':
continue
county1[col].replace(np.nan,0,inplace=True)
county1[col].replace(' ',0,inplace=True)
county1[col].replace('(NA)',0,inplace=True)
county1[col].replace('(a)',0,inplace=True)
county1[col] = county1[col].replace({',':''},regex=True).apply(pd.to_numeric,1)
display(county1.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3254 entries, 5 to 3258 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 median household by county 3254 non-null object 1 mhs1999 3254 non-null int64 2 mhs1989 3254 non-null int64 3 mhs1979 3254 non-null int64 4 mhs1969 3254 non-null int64 dtypes: int64(4), object(1) memory usage: 127.2+ KB
None
display(county1)
| median household by county | mhs1999 | mhs1989 | mhs1979 | mhs1969 | |
|---|---|---|---|---|---|
| 5 | United States | 41994 | 30056 | 16841 | 8486 |
| 6 | 0 | 0 | 0 | 0 | 0 |
| 7 | Alabama | 34135 | 23597 | 13669 | 6419 |
| 8 | Autauga County, AL | 42013 | 28337 | 16524 | 6808 |
| 9 | Baldwin County, AL | 40250 | 25712 | 14614 | 6430 |
| ... | ... | ... | ... | ... | ... |
| 3254 | Sweetwater County, WY | 46537 | 36210 | 24114 | 8284 |
| 3255 | Teton County, WY | 54614 | 31586 | 18442 | 8410 |
| 3256 | Uinta County, WY | 44544 | 33259 | 22584 | 8409 |
| 3257 | Washakie County, WY | 34943 | 25172 | 18239 | 7588 |
| 3258 | Weston County, WY | 32348 | 26213 | 20021 | 7796 |
3254 rows × 5 columns
county1['median_code'] = pd.factorize(county1['median household by county'])[0]
display(county1)
| median household by county | mhs1999 | mhs1989 | mhs1979 | mhs1969 | median_code | |
|---|---|---|---|---|---|---|
| 5 | United States | 41994 | 30056 | 16841 | 8486 | 0 |
| 6 | 0 | 0 | 0 | 0 | 0 | 1 |
| 7 | Alabama | 34135 | 23597 | 13669 | 6419 | 2 |
| 8 | Autauga County, AL | 42013 | 28337 | 16524 | 6808 | 3 |
| 9 | Baldwin County, AL | 40250 | 25712 | 14614 | 6430 | 4 |
| ... | ... | ... | ... | ... | ... | ... |
| 3254 | Sweetwater County, WY | 46537 | 36210 | 24114 | 8284 | 3194 |
| 3255 | Teton County, WY | 54614 | 31586 | 18442 | 8410 | 3195 |
| 3256 | Uinta County, WY | 44544 | 33259 | 22584 | 8409 | 3196 |
| 3257 | Washakie County, WY | 34943 | 25172 | 18239 | 7588 | 3197 |
| 3258 | Weston County, WY | 32348 | 26213 | 20021 | 7796 | 3198 |
3254 rows × 6 columns
plt.figure(figsize=(12,12))
sns.heatmap(county1.corr(), annot=True)
plt.title("County1 features correlation matrix", fontsize=15, color='red')
plt.xticks(rotation = 90)
plt.show()
plt.figure(figsize=(20,20))
sns.pairplot(county1, corner=True)
plt.show()
<Figure size 1440x1440 with 0 Axes>
dow = pd.read_csv("data/EconomicData/Dow Jones Industrial Average Historical Data 2.csv")
display(dow.info())
display(dow.isna().sum())
display(dow)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2766 entries, 0 to 2765 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 2766 non-null object 1 Price 2766 non-null object 2 Open 2766 non-null object 3 High 2766 non-null object 4 Low 2766 non-null object 5 Vol. 2766 non-null object 6 Change % 2766 non-null object dtypes: object(7) memory usage: 151.4+ KB
None
Date 0 Price 0 Open 0 High 0 Low 0 Vol. 0 Change % 0 dtype: int64
| Date | Price | Open | High | Low | Vol. | Change % | |
|---|---|---|---|---|---|---|---|
| 0 | Dec 31, 2019 | 28,538.44 | 28,414.64 | 28,547.35 | 28,376.49 | 193.34M | 0.27% |
| 1 | Dec 30, 2019 | 28,462.14 | 28,654.76 | 28,664.69 | 28,428.98 | 185.07M | -0.64% |
| 2 | Dec 27, 2019 | 28,645.26 | 28,675.34 | 28,701.66 | 28,608.98 | 184.93M | 0.08% |
| 3 | Dec 26, 2019 | 28,621.39 | 28,539.46 | 28,624.10 | 28,535.15 | 155.97M | 0.37% |
| 4 | Dec 24, 2019 | 28,515.45 | 28,572.57 | 28,576.80 | 28,503.21 | 95.29M | -0.13% |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2761 | Jan 09, 2009 | 8,599.18 | 8,738.80 | 8,800.45 | 8,541.75 | - | -1.64% |
| 2762 | Jan 08, 2009 | 8,742.46 | 8,769.94 | 8,807.14 | 8,593.52 | - | -0.31% |
| 2763 | Jan 07, 2009 | 8,769.70 | 8,996.94 | 8,996.94 | 8,690.45 | - | -2.72% |
| 2764 | Jan 06, 2009 | 9,015.10 | 8,954.57 | 9,175.19 | 8,868.07 | - | 0.69% |
| 2765 | Jan 05, 2009 | 8,952.89 | 9,027.13 | 9,093.47 | 8,841.70 | - | -0.91% |
2766 rows × 7 columns
We have to preproccess this :
1- Date to datetime then replace by three numeric columns (day, month, year)
2- the other columns are of numeric types, so we have to drop commas, M that says million, and drop %.
# lower case headers
dow.columns = dow.columns.str.lower()
dow['date'] = pd.to_datetime(dow['date'])
dow['day'] = pd.to_numeric(dow.date.dt.strftime('%d'))
dow['month'] = pd.to_numeric(dow.date.dt.strftime('%m'))
dow['year'] = pd.to_numeric(dow.date.dt.strftime('%Y'))
dow = dow[['day', 'month', 'year', 'price', 'open', 'high', 'low', 'vol.', 'change %']]
display(dow.info())
display(dow)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2766 entries, 0 to 2765 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 day 2766 non-null int64 1 month 2766 non-null int64 2 year 2766 non-null int64 3 price 2766 non-null object 4 open 2766 non-null object 5 high 2766 non-null object 6 low 2766 non-null object 7 vol. 2766 non-null object 8 change % 2766 non-null object dtypes: int64(3), object(6) memory usage: 194.6+ KB
None
| day | month | year | price | open | high | low | vol. | change % | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 31 | 12 | 2019 | 28,538.44 | 28,414.64 | 28,547.35 | 28,376.49 | 193.34M | 0.27% |
| 1 | 30 | 12 | 2019 | 28,462.14 | 28,654.76 | 28,664.69 | 28,428.98 | 185.07M | -0.64% |
| 2 | 27 | 12 | 2019 | 28,645.26 | 28,675.34 | 28,701.66 | 28,608.98 | 184.93M | 0.08% |
| 3 | 26 | 12 | 2019 | 28,621.39 | 28,539.46 | 28,624.10 | 28,535.15 | 155.97M | 0.37% |
| 4 | 24 | 12 | 2019 | 28,515.45 | 28,572.57 | 28,576.80 | 28,503.21 | 95.29M | -0.13% |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2761 | 9 | 1 | 2009 | 8,599.18 | 8,738.80 | 8,800.45 | 8,541.75 | - | -1.64% |
| 2762 | 8 | 1 | 2009 | 8,742.46 | 8,769.94 | 8,807.14 | 8,593.52 | - | -0.31% |
| 2763 | 7 | 1 | 2009 | 8,769.70 | 8,996.94 | 8,996.94 | 8,690.45 | - | -2.72% |
| 2764 | 6 | 1 | 2009 | 9,015.10 | 8,954.57 | 9,175.19 | 8,868.07 | - | 0.69% |
| 2765 | 5 | 1 | 2009 | 8,952.89 | 9,027.13 | 9,093.47 | 8,841.70 | - | -0.91% |
2766 rows × 9 columns
# rename vol. to vol. (millions) since we are going to drop M
dow.rename(columns={'vol.': 'vol. (millions)'}, inplace=True)
display(dow)
C:\Users\lover\anaconda3\lib\site-packages\pandas\core\frame.py:5047: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy errors=errors,
| day | month | year | price | open | high | low | vol. (millions) | change % | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 31 | 12 | 2019 | 28,538.44 | 28,414.64 | 28,547.35 | 28,376.49 | 193.34M | 0.27% |
| 1 | 30 | 12 | 2019 | 28,462.14 | 28,654.76 | 28,664.69 | 28,428.98 | 185.07M | -0.64% |
| 2 | 27 | 12 | 2019 | 28,645.26 | 28,675.34 | 28,701.66 | 28,608.98 | 184.93M | 0.08% |
| 3 | 26 | 12 | 2019 | 28,621.39 | 28,539.46 | 28,624.10 | 28,535.15 | 155.97M | 0.37% |
| 4 | 24 | 12 | 2019 | 28,515.45 | 28,572.57 | 28,576.80 | 28,503.21 | 95.29M | -0.13% |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2761 | 9 | 1 | 2009 | 8,599.18 | 8,738.80 | 8,800.45 | 8,541.75 | - | -1.64% |
| 2762 | 8 | 1 | 2009 | 8,742.46 | 8,769.94 | 8,807.14 | 8,593.52 | - | -0.31% |
| 2763 | 7 | 1 | 2009 | 8,769.70 | 8,996.94 | 8,996.94 | 8,690.45 | - | -2.72% |
| 2764 | 6 | 1 | 2009 | 9,015.10 | 8,954.57 | 9,175.19 | 8,868.07 | - | 0.69% |
| 2765 | 5 | 1 | 2009 | 8,952.89 | 9,027.13 | 9,093.47 | 8,841.70 | - | -0.91% |
2766 rows × 9 columns
dow = dow.replace({',':'', 'M': '', '%': '', '-':0},regex=True).apply(pd.to_numeric,1)
display(dow.info())
display(dow.isna().sum())
display(dow)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2766 entries, 0 to 2765 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 day 2766 non-null float64 1 month 2766 non-null float64 2 year 2766 non-null float64 3 price 2766 non-null float64 4 open 2766 non-null float64 5 high 2766 non-null float64 6 low 2766 non-null float64 7 vol. (millions) 2766 non-null float64 8 change % 2766 non-null float64 dtypes: float64(9) memory usage: 194.6 KB
None
day 0 month 0 year 0 price 0 open 0 high 0 low 0 vol. (millions) 0 change % 0 dtype: int64
| day | month | year | price | open | high | low | vol. (millions) | change % | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 31.0 | 12.0 | 2019.0 | 28538.44 | 28414.64 | 28547.35 | 28376.49 | 193.34 | 0.27 |
| 1 | 30.0 | 12.0 | 2019.0 | 28462.14 | 28654.76 | 28664.69 | 28428.98 | 185.07 | 0.00 |
| 2 | 27.0 | 12.0 | 2019.0 | 28645.26 | 28675.34 | 28701.66 | 28608.98 | 184.93 | 0.08 |
| 3 | 26.0 | 12.0 | 2019.0 | 28621.39 | 28539.46 | 28624.10 | 28535.15 | 155.97 | 0.37 |
| 4 | 24.0 | 12.0 | 2019.0 | 28515.45 | 28572.57 | 28576.80 | 28503.21 | 95.29 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2761 | 9.0 | 1.0 | 2009.0 | 8599.18 | 8738.80 | 8800.45 | 8541.75 | 0.00 | 0.00 |
| 2762 | 8.0 | 1.0 | 2009.0 | 8742.46 | 8769.94 | 8807.14 | 8593.52 | 0.00 | 0.00 |
| 2763 | 7.0 | 1.0 | 2009.0 | 8769.70 | 8996.94 | 8996.94 | 8690.45 | 0.00 | 0.00 |
| 2764 | 6.0 | 1.0 | 2009.0 | 9015.10 | 8954.57 | 9175.19 | 8868.07 | 0.00 | 0.69 |
| 2765 | 5.0 | 1.0 | 2009.0 | 8952.89 | 9027.13 | 9093.47 | 8841.70 | 0.00 | 0.00 |
2766 rows × 9 columns
plt.figure(figsize=(15,15))
sns.heatmap(dow.corr(), annot=True)
plt.title("Dow Jones features correlation matrix", fontsize=15, color='red')
plt.xticks(rotation =90)
plt.show()
We notice a very high correlation group consisted by (year, price, open, high, low) so we chose to drop all but price.
dow.drop(columns=['year', 'open', 'high', 'low'], inplace=True)
plt.figure(figsize=(20,20))
sns.pairplot(dow, corner=True)
plt.show()
<Figure size 1440x1440 with 0 Axes>
gdp = pd.read_excel("data/EconomicData/gdpcounty1218.xlsx")
display(gdp.info())
display(gdp.isna().sum())
display(gdp)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12459 entries, 0 to 12458 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 12456 non-null object 1 Unnamed: 1 12453 non-null object 2 Unnamed: 2 12453 non-null object 3 Unnamed: 3 12453 non-null object 4 Unnamed: 4 12453 non-null object 5 Unnamed: 5 12455 non-null object 6 Unnamed: 6 12453 non-null object 7 Unnamed: 7 12453 non-null object 8 Unnamed: 8 12453 non-null object dtypes: object(9) memory usage: 876.1+ KB
None
Unnamed: 0 3 Unnamed: 1 6 Unnamed: 2 6 Unnamed: 3 6 Unnamed: 4 6 Unnamed: 5 4 Unnamed: 6 6 Unnamed: 7 6 Unnamed: 8 6 dtype: int64
| Unnamed: 0 | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | FIPS | Countyname | Postal | LineCode | IndustryName | Gross domestic product (GDP) by county | NaN | NaN | NaN |
| 1 | NaN | NaN | NaN | NaN | NaN | (thousands of dollars) | NaN | NaN | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | 2012 | 2013 | 2014 | 2015 |
| 3 | 01001 | Autauga | AL | 1 | All Industries | 1383941 | 1363368 | 1402516 | 1539406 |
| 4 | 01001 | Autauga | AL | 2 | Private goods-producing industries | 286396 | 310468 | 323582 | 346355 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12454 | 56045 | Weston | WY | 4 | Government and government enterprises | 54750 | 58758 | 59301 | 59461 |
| 12455 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 12456 | Source: U.S. Bureau of Economic Analysis | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 12457 | (D) Not shown to avoid disclosure of confident... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 12458 | Note: Detail may not sum to higher-level aggre... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
12459 rows × 9 columns
Also here we have multiple line header, and extra rows at the end. We will fix that
gdp = gdp.loc[3:12454]
gdp.columns = ['fips', 'county', 'postal', 'line_code', 'industry_name', 'gdp2012', 'gdp2013', 'gdp2014', 'gdp2015']
display(gdp.info())
display(gdp.isna().sum())
display(gdp)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12452 entries, 3 to 12454 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fips 12452 non-null object 1 county 12452 non-null object 2 postal 12452 non-null object 3 line_code 12452 non-null object 4 industry_name 12452 non-null object 5 gdp2012 12452 non-null object 6 gdp2013 12452 non-null object 7 gdp2014 12452 non-null object 8 gdp2015 12452 non-null object dtypes: object(9) memory usage: 875.7+ KB
None
fips 0 county 0 postal 0 line_code 0 industry_name 0 gdp2012 0 gdp2013 0 gdp2014 0 gdp2015 0 dtype: int64
| fips | county | postal | line_code | industry_name | gdp2012 | gdp2013 | gdp2014 | gdp2015 | |
|---|---|---|---|---|---|---|---|---|---|
| 3 | 01001 | Autauga | AL | 1 | All Industries | 1383941 | 1363368 | 1402516 | 1539406 |
| 4 | 01001 | Autauga | AL | 2 | Private goods-producing industries | 286396 | 310468 | 323582 | 346355 |
| 5 | 01001 | Autauga | AL | 3 | Private services-providing industries | 948490 | 904599 | 928438 | 1037309 |
| 6 | 01001 | Autauga | AL | 4 | Government and government enterprises | 149055 | 148301 | 150496 | 155742 |
| 7 | 01003 | Baldwin | AL | 1 | All Industries | 5599194 | 6365080 | 6547396 | 6436107 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12450 | 56043 | Washakie | WY | 4 | Government and government enterprises | 62263 | 64489 | 65430 | 65738 |
| 12451 | 56045 | Weston | WY | 1 | All Industries | 332472 | 306384 | 312816 | 347555 |
| 12452 | 56045 | Weston | WY | 2 | Private goods-producing industries | 181482 | 149558 | 148903 | 175708 |
| 12453 | 56045 | Weston | WY | 3 | Private services-providing industries | 96240 | 98069 | 104612 | 112386 |
| 12454 | 56045 | Weston | WY | 4 | Government and government enterprises | 54750 | 58758 | 59301 | 59461 |
12452 rows × 9 columns
gdps = gdp[['gdp2012', 'gdp2013', 'gdp2014', 'gdp2015']]
display(gdps)
| gdp2012 | gdp2013 | gdp2014 | gdp2015 | |
|---|---|---|---|---|
| 3 | 1383941 | 1363368 | 1402516 | 1539406 |
| 4 | 286396 | 310468 | 323582 | 346355 |
| 5 | 948490 | 904599 | 928438 | 1037309 |
| 6 | 149055 | 148301 | 150496 | 155742 |
| 7 | 5599194 | 6365080 | 6547396 | 6436107 |
| ... | ... | ... | ... | ... |
| 12450 | 62263 | 64489 | 65430 | 65738 |
| 12451 | 332472 | 306384 | 312816 | 347555 |
| 12452 | 181482 | 149558 | 148903 | 175708 |
| 12453 | 96240 | 98069 | 104612 | 112386 |
| 12454 | 54750 | 58758 | 59301 | 59461 |
12452 rows × 4 columns
gdps = gdps.replace({np.nan:0, ' ':0, '-':0, '(D)':0},regex=True).apply(pd.to_numeric,1)
display(gdps.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12452 entries, 3 to 12454 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gdp2012 12452 non-null int64 1 gdp2013 12452 non-null int64 2 gdp2014 12452 non-null int64 3 gdp2015 12452 non-null int64 dtypes: int64(4) memory usage: 389.3 KB
None
gdp['fips'].unique()
array(['01001', '01003', '01005', ..., '56041', '56043', '56045'],
dtype=object)
gdp['fips'] = pd.to_numeric(gdp['fips'])
gdp['line_code'] = pd.to_numeric(gdp['line_code'])
gdp = pd.concat([gdp[['fips','county','postal', 'line_code', 'industry_name']], gdps], axis = 1)
display(gdp.info())
display(gdp.isna().sum())
display(gdp)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12452 entries, 3 to 12454 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fips 12452 non-null int64 1 county 12452 non-null object 2 postal 12452 non-null object 3 line_code 12452 non-null int64 4 industry_name 12452 non-null object 5 gdp2012 12452 non-null int64 6 gdp2013 12452 non-null int64 7 gdp2014 12452 non-null int64 8 gdp2015 12452 non-null int64 dtypes: int64(6), object(3) memory usage: 875.7+ KB
None
fips 0 county 0 postal 0 line_code 0 industry_name 0 gdp2012 0 gdp2013 0 gdp2014 0 gdp2015 0 dtype: int64
| fips | county | postal | line_code | industry_name | gdp2012 | gdp2013 | gdp2014 | gdp2015 | |
|---|---|---|---|---|---|---|---|---|---|
| 3 | 1001 | Autauga | AL | 1 | All Industries | 1383941 | 1363368 | 1402516 | 1539406 |
| 4 | 1001 | Autauga | AL | 2 | Private goods-producing industries | 286396 | 310468 | 323582 | 346355 |
| 5 | 1001 | Autauga | AL | 3 | Private services-providing industries | 948490 | 904599 | 928438 | 1037309 |
| 6 | 1001 | Autauga | AL | 4 | Government and government enterprises | 149055 | 148301 | 150496 | 155742 |
| 7 | 1003 | Baldwin | AL | 1 | All Industries | 5599194 | 6365080 | 6547396 | 6436107 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12450 | 56043 | Washakie | WY | 4 | Government and government enterprises | 62263 | 64489 | 65430 | 65738 |
| 12451 | 56045 | Weston | WY | 1 | All Industries | 332472 | 306384 | 312816 | 347555 |
| 12452 | 56045 | Weston | WY | 2 | Private goods-producing industries | 181482 | 149558 | 148903 | 175708 |
| 12453 | 56045 | Weston | WY | 3 | Private services-providing industries | 96240 | 98069 | 104612 | 112386 |
| 12454 | 56045 | Weston | WY | 4 | Government and government enterprises | 54750 | 58758 | 59301 | 59461 |
12452 rows × 9 columns
plt.figure(figsize=(15,15))
sns.heatmap(gdp.corr(), annot=True)
plt.title("Gross domestic product (GDP) by county features correlation matrix", fontsize=15, color='red')
plt.xticks(rotation = 90)
plt.show()
We notice that all the years included identaclly correlated, so we will drop all but gdp2015.
gdp.drop(columns=['gdp2012', 'gdp2013', 'gdp2014'], inplace=True)
plt.figure(figsize=(20,20))
sns.pairplot(gdp, corner=True)
plt.show()
<Figure size 1440x1440 with 0 Axes>
plt.figure(figsize=(15,15))
sns.scatterplot(data=gdp, x='county', y='gdp2015', hue='industry_name')
plt.title("county Vs. gdp2015, by industry_name", fontsize=15, color='red')
plt.xticks(rotation = 75)
plt.show()
complete = pd.read_csv("data/GeneralDemographicData/county_complete.csv")
display(complete.info())
display(complete.isna().sum())
display(complete)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3142 entries, 0 to 3141 Columns: 188 entries, fips to white_not_hispanic_2019 dtypes: float64(166), int64(18), object(4) memory usage: 4.5+ MB
None
fips 0
state 0
name 0
pop2000 3
pop2010 0
..
uninsured_under_19_2019 0
uninsured_under_6_2019 0
veterans_2019 0
white_2019 0
white_not_hispanic_2019 0
Length: 188, dtype: int64
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | 23.2 | 2.2 | 3.5 | 7.1 | 0.0 | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 |
| 1 | 1003 | Alabama | Baldwin County | 140415.0 | 182265 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | 202863.0 | ... | 13.4 | 1.7 | 4.0 | 8.9 | 0.3 | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 |
| 2 | 1005 | Alabama | Barbour County | 29038.0 | 27457 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | 26264.0 | ... | 50.1 | 1.2 | 9.4 | 11.3 | 0.3 | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 |
| 3 | 1007 | Alabama | Bibb County | 20826.0 | 22915 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | 22561.0 | ... | NaN | 0.6 | 7.0 | 10.7 | 0.0 | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 |
| 4 | 1009 | Alabama | Blount County | 51024.0 | 57322 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | 57590.0 | ... | 18.4 | 1.6 | 3.1 | 10.8 | 0.2 | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3137 | 56037 | Wyoming | Sweetwater County | 37613.0 | 43806 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | 44732.0 | ... | NaN | 2.3 | 5.7 | 11.3 | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 |
| 3138 | 56039 | Wyoming | Teton County | 18251.0 | 21294 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | 23029.0 | ... | NaN | 0.7 | 0.7 | 12.7 | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 |
| 3139 | 56041 | Wyoming | Uinta County | 19742.0 | 21118 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | 20780.0 | ... | NaN | 3.5 | 5.5 | 11.2 | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 |
| 3140 | 56043 | Wyoming | Washakie County | 8289.0 | 8533 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | 8296.0 | ... | NaN | 3.8 | 4.1 | 15.0 | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 |
| 3141 | 56045 | Wyoming | Weston County | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | ... | NaN | 1.3 | 4.0 | 11.8 | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 |
3142 rows × 188 columns
We can see that there are missing values, and that this dataframe has a lot of columns so we can impute the missing values with KNNImputer with n_neighbors= 30.
In order to do so, we have 4 string columns that we will seperate and save in side. Then we copy the numeric columns into seperate dataframe and fit the knn.
Lastly, we recreate the full dataframe by concating the two sepearted dataframes.
complete.replace({'-':''},regex=True, inplace=True)
complete['uninsured_age_under_6_2017'] = pd.to_numeric(complete['uninsured_age_under_6_2017'])
cpObject = complete.select_dtypes('object')
cpNumeric = complete.select_dtypes(exclude='object')
display(cpObject)
display(cpNumeric)
| state | name | smoking_ban_2010 | |
|---|---|---|---|
| 0 | Alabama | Autauga County | none |
| 1 | Alabama | Baldwin County | none |
| 2 | Alabama | Barbour County | partial |
| 3 | Alabama | Bibb County | none |
| 4 | Alabama | Blount County | none |
| ... | ... | ... | ... |
| 3137 | Wyoming | Sweetwater County | none |
| 3138 | Wyoming | Teton County | partial |
| 3139 | Wyoming | Uinta County | none |
| 3140 | Wyoming | Washakie County | none |
| 3141 | Wyoming | Weston County | none |
3142 rows × 3 columns
| fips | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | pop2016 | pop2017 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | 55278.0 | 55504.0 | ... | 23.2 | 2.2 | 3.5 | 7.1 | 0.0 | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 |
| 1 | 1003 | 140415.0 | 182265 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | 202863.0 | 207509.0 | 212628.0 | ... | 13.4 | 1.7 | 4.0 | 8.9 | 0.3 | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 |
| 2 | 1005 | 29038.0 | 27457 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | 26264.0 | 25774.0 | 25270.0 | ... | 50.1 | 1.2 | 9.4 | 11.3 | 0.3 | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 |
| 3 | 1007 | 20826.0 | 22915 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | 22561.0 | 22633.0 | 22668.0 | ... | NaN | 0.6 | 7.0 | 10.7 | 0.0 | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 |
| 4 | 1009 | 51024.0 | 57322 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | 57590.0 | 57562.0 | 58013.0 | ... | 18.4 | 1.6 | 3.1 | 10.8 | 0.2 | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3137 | 56037 | 37613.0 | 43806 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | 44732.0 | 44245.0 | 43534.0 | ... | NaN | 2.3 | 5.7 | 11.3 | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 |
| 3138 | 56039 | 18251.0 | 21294 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | 23029.0 | 23180.0 | 23265.0 | ... | NaN | 0.7 | 0.7 | 12.7 | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 |
| 3139 | 56041 | 19742.0 | 21118 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | 20780.0 | 20711.0 | 20495.0 | ... | NaN | 3.5 | 5.5 | 11.2 | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 |
| 3140 | 56043 | 8289.0 | 8533 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | 8296.0 | 8188.0 | 8064.0 | ... | NaN | 3.8 | 4.1 | 15.0 | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 |
| 3141 | 56045 | 6644.0 | 7208 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | 7198.0 | 6927.0 | ... | NaN | 1.3 | 4.0 | 11.8 | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 |
3142 rows × 185 columns
knn = KNNImputer(n_neighbors=30).fit(cpNumeric)
imputed = pd.DataFrame(knn.transform(cpNumeric), columns = cpNumeric.columns)
display(imputed)
| fips | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | pop2016 | pop2017 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001.0 | 43671.0 | 54571.0 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | 55278.0 | 55504.0 | ... | 23.200000 | 2.2 | 3.5 | 7.1 | 0.0 | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 |
| 1 | 1003.0 | 140415.0 | 182265.0 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | 202863.0 | 207509.0 | 212628.0 | ... | 13.400000 | 1.7 | 4.0 | 8.9 | 0.3 | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 |
| 2 | 1005.0 | 29038.0 | 27457.0 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | 26264.0 | 25774.0 | 25270.0 | ... | 50.100000 | 1.2 | 9.4 | 11.3 | 0.3 | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 |
| 3 | 1007.0 | 20826.0 | 22915.0 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | 22561.0 | 22633.0 | 22668.0 | ... | 34.113333 | 0.6 | 7.0 | 10.7 | 0.0 | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 |
| 4 | 1009.0 | 51024.0 | 57322.0 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | 57590.0 | 57562.0 | 58013.0 | ... | 18.400000 | 1.6 | 3.1 | 10.8 | 0.2 | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3137 | 56037.0 | 37613.0 | 43806.0 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | 44732.0 | 44245.0 | 43534.0 | ... | 20.110000 | 2.3 | 5.7 | 11.3 | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 |
| 3138 | 56039.0 | 18251.0 | 21294.0 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | 23029.0 | 23180.0 | 23265.0 | ... | 20.320000 | 0.7 | 0.7 | 12.7 | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 |
| 3139 | 56041.0 | 19742.0 | 21118.0 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | 20780.0 | 20711.0 | 20495.0 | ... | 25.250000 | 3.5 | 5.5 | 11.2 | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 |
| 3140 | 56043.0 | 8289.0 | 8533.0 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | 8296.0 | 8188.0 | 8064.0 | ... | 33.856667 | 3.8 | 4.1 | 15.0 | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 |
| 3141 | 56045.0 | 6644.0 | 7208.0 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | 7181.0 | 7198.0 | 6927.0 | ... | 34.710000 | 1.3 | 4.0 | 11.8 | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 |
3142 rows × 185 columns
complete = pd.concat([cpObject, imputed], axis=1)
display(complete.info())
display(complete.isna().sum())
display(complete)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3142 entries, 0 to 3141 Columns: 188 entries, state to white_not_hispanic_2019 dtypes: float64(185), object(3) memory usage: 4.5+ MB
None
state 0
name 0
smoking_ban_2010 26
fips 0
pop2000 0
..
uninsured_under_19_2019 0
uninsured_under_6_2019 0
veterans_2019 0
white_2019 0
white_not_hispanic_2019 0
Length: 188, dtype: int64
| state | name | smoking_ban_2010 | fips | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Alabama | Autauga County | none | 1001.0 | 43671.0 | 54571.0 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | ... | 23.200000 | 2.2 | 3.5 | 7.1 | 0.0 | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 |
| 1 | Alabama | Baldwin County | none | 1003.0 | 140415.0 | 182265.0 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | ... | 13.400000 | 1.7 | 4.0 | 8.9 | 0.3 | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 |
| 2 | Alabama | Barbour County | partial | 1005.0 | 29038.0 | 27457.0 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | ... | 50.100000 | 1.2 | 9.4 | 11.3 | 0.3 | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 |
| 3 | Alabama | Bibb County | none | 1007.0 | 20826.0 | 22915.0 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | ... | 34.113333 | 0.6 | 7.0 | 10.7 | 0.0 | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 |
| 4 | Alabama | Blount County | none | 1009.0 | 51024.0 | 57322.0 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | ... | 18.400000 | 1.6 | 3.1 | 10.8 | 0.2 | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3137 | Wyoming | Sweetwater County | none | 56037.0 | 37613.0 | 43806.0 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | ... | 20.110000 | 2.3 | 5.7 | 11.3 | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 |
| 3138 | Wyoming | Teton County | partial | 56039.0 | 18251.0 | 21294.0 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | ... | 20.320000 | 0.7 | 0.7 | 12.7 | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 |
| 3139 | Wyoming | Uinta County | none | 56041.0 | 19742.0 | 21118.0 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | ... | 25.250000 | 3.5 | 5.5 | 11.2 | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 |
| 3140 | Wyoming | Washakie County | none | 56043.0 | 8289.0 | 8533.0 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | ... | 33.856667 | 3.8 | 4.1 | 15.0 | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 |
| 3141 | Wyoming | Weston County | none | 56045.0 | 6644.0 | 7208.0 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | ... | 34.710000 | 1.3 | 4.0 | 11.8 | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 |
3142 rows × 188 columns
complete['smoking_ban_2010'].unique()
array(['none', 'partial', 'comprehensive', nan], dtype=object)
complete.replace(np.nan, 'none', inplace=True)
display(complete.isna().sum())
state 0
name 0
smoking_ban_2010 0
fips 0
pop2000 0
..
uninsured_under_19_2019 0
uninsured_under_6_2019 0
veterans_2019 0
white_2019 0
white_not_hispanic_2019 0
Length: 188, dtype: int64
# lower case string values
complete = complete.applymap(lambda s: s.lower() if type(s)==str else s)
display(complete.sample(20))
| state | name | smoking_ban_2010 | fips | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1465 | mississippi | smith county | none | 28129.0 | 16182.0 | 16491.0 | 16531.0 | 16342.0 | 16233.0 | 16211.0 | ... | 34.476667 | 0.4 | 4.2 | 14.0 | 0.0 | 6.3 | 4.8 | 6.6 | 75.8 | 74.0 |
| 116 | arkansas | bradley county | none | 5011.0 | 12600.0 | 11508.0 | 11425.0 | 11262.0 | 11120.0 | 11007.0 | ... | 33.200000 | 0.2 | 7.5 | 11.0 | 0.0 | 4.6 | 6.0 | 9.3 | 65.9 | 55.5 |
| 1102 | kentucky | todd county | comprehensive | 21219.0 | 11971.0 | 12460.0 | 12410.0 | 12597.0 | 12450.0 | 12409.0 | ... | 35.673333 | 1.6 | 6.7 | 15.2 | 1.6 | 17.4 | 15.9 | 7.6 | 87.5 | 85.7 |
| 1918 | north carolina | davidson county | none | 37057.0 | 147246.0 | 162878.0 | 162972.0 | 163112.0 | 163398.0 | 163530.0 | ... | 23.600000 | 1.8 | 5.7 | 11.3 | 0.6 | 5.8 | 3.6 | 7.9 | 85.8 | 80.2 |
| 107 | arizona | pinal county | comprehensive | 4021.0 | 179727.0 | 375770.0 | 378226.0 | 381910.0 | 384258.0 | 394004.0 | ... | 17.500000 | 4.2 | 6.4 | 8.8 | 0.7 | 7.5 | 7.5 | 10.5 | 79.9 | 56.8 |
| 273 | colorado | jackson county | comprehensive | 8057.0 | 1577.0 | 1394.0 | 1358.0 | 1326.0 | 1337.0 | 1385.0 | ... | 35.460000 | 0.4 | 2.0 | 6.1 | 0.0 | 7.0 | 3.8 | 14.2 | 97.9 | 82.2 |
| 2968 | washington | jefferson county | none | 53031.0 | 25953.0 | 29872.0 | 29835.0 | 29782.0 | 29990.0 | 30135.0 | ... | 28.290000 | 3.8 | 5.5 | 5.4 | 0.2 | 3.8 | 0.3 | 13.8 | 90.9 | 88.4 |
| 725 | indiana | hamilton county | none | 18057.0 | 182740.0 | 274569.0 | 283273.0 | 289599.0 | 296789.0 | 302828.0 | ... | 5.300000 | 2.4 | 2.4 | 4.4 | 0.3 | 3.4 | 3.8 | 6.0 | 86.8 | 83.7 |
| 2442 | tennessee | cocke county | none | 47029.0 | 33565.0 | 35662.0 | 35401.0 | 35455.0 | 35325.0 | 35201.0 | ... | 40.000000 | 2.1 | 8.5 | 11.2 | 0.8 | 5.0 | 1.4 | 10.0 | 94.8 | 93.3 |
| 260 | colorado | denver county | partial | 8031.0 | 554636.0 | 600158.0 | 619356.0 | 633798.0 | 648049.0 | 663271.0 | ... | 18.200000 | 3.8 | 3.6 | 9.3 | 1.4 | 4.2 | 2.3 | 5.2 | 76.1 | 54.2 |
| 2982 | washington | skamania county | comprehensive | 53059.0 | 9872.0 | 11066.0 | 11150.0 | 11198.0 | 11308.0 | 11376.0 | ... | 33.906667 | 2.3 | 4.2 | 4.0 | 0.3 | 0.0 | 0.0 | 12.3 | 92.8 | 87.7 |
| 1113 | louisiana | acadia parish | partial | 22001.0 | 58861.0 | 61773.0 | 61826.0 | 61984.0 | 62284.0 | 62664.0 | ... | 35.600000 | 2.8 | 7.3 | 10.6 | 0.6 | 5.0 | 3.9 | 5.8 | 79.3 | 77.4 |
| 1839 | new york | cortland county | comprehensive | 36023.0 | 48599.0 | 49336.0 | 49373.0 | 49024.0 | 48912.0 | 48742.0 | ... | 18.200000 | 1.9 | 4.9 | 3.2 | 0.4 | 3.0 | 4.2 | 7.2 | 94.6 | 92.6 |
| 394 | georgia | bartow county | none | 13015.0 | 76019.0 | 100157.0 | 100213.0 | 100365.0 | 101056.0 | 101397.0 | ... | 18.200000 | 2.4 | 4.5 | 14.9 | 0.8 | 8.7 | 8.6 | 7.8 | 83.7 | 77.5 |
| 2805 | utah | weber county | partial | 49057.0 | 196533.0 | 231236.0 | 233890.0 | 236342.0 | 238222.0 | 240219.0 | ... | 12.500000 | 3.5 | 3.3 | 8.9 | 0.7 | 6.4 | 5.8 | 7.7 | 89.2 | 76.1 |
| 764 | indiana | randolph county | none | 18135.0 | 27401.0 | 26171.0 | 26021.0 | 25848.0 | 25610.0 | 25313.0 | ... | 31.396667 | 2.5 | 5.1 | 9.4 | 0.5 | 6.1 | 7.7 | 8.3 | 94.9 | 92.8 |
| 2215 | oregon | curry county | none | 41015.0 | 21137.0 | 22364.0 | 22477.0 | 22249.0 | 22217.0 | 22130.0 | ... | 28.160000 | 6.3 | 7.5 | 6.1 | 0.4 | 2.6 | 3.8 | 15.4 | 90.9 | 86.3 |
| 1295 | michigan | ogemaw county | none | 26129.0 | 21645.0 | 21699.0 | 21495.0 | 21365.0 | 21167.0 | 20961.0 | ... | 28.843333 | 1.3 | 7.4 | 6.5 | 0.3 | 6.3 | 7.9 | 10.6 | 96.5 | 94.6 |
| 1947 | north carolina | madison county | comprehensive | 37115.0 | 19635.0 | 20764.0 | 20853.0 | 20902.0 | 21171.0 | 21219.0 | ... | 35.683333 | 1.6 | 4.7 | 9.3 | 0.3 | 5.5 | 2.4 | 8.7 | 95.4 | 94.0 |
| 885 | iowa | woodbury county | comprehensive | 19193.0 | 103877.0 | 102172.0 | 102622.0 | 102290.0 | 102195.0 | 102200.0 | ... | 17.300000 | 3.5 | 4.1 | 5.7 | 0.5 | 1.9 | 0.5 | 7.0 | 85.7 | 73.0 |
20 rows × 188 columns
plt.figure(figsize=(15,15))
sns.scatterplot(data=complete, x='state', y='uninsured_age_under_6_2017', hue='smoking_ban_2010')
plt.title("State Vs. uninsured_age_under_6_2017, by smoking_ban_2010", fontsize=15, color='red')
plt.xticks(rotation = 75)
plt.show()
life = pd.read_excel("data/LifeExpectancyData/IHME_USA_LIFE_EXPECTANCY_1987_2007_Y2011M06D16.XLSX")
display(life.info())
display(life.isna().sum())
display(life)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 66087 entries, 0 to 66086 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fips 66087 non-null int64 1 State 66087 non-null object 2 County 66087 non-null object 3 Year 66087 non-null int64 4 Male life expectancy (years) 66087 non-null float64 5 Years behind international frontier (male) 66087 non-null object 6 Female life expectancy (years) 66087 non-null float64 7 Years behind international frontier (female) 66087 non-null object 8 White male life expectancy (years) 66087 non-null float64 9 White female life expectancy (years) 66087 non-null float64 10 Black male life expectancy (years) 15494 non-null float64 11 Black female life expectancy (years) 18362 non-null float64 12 Closest ranked countries for male life expectancy (higher) 65991 non-null object 13 Closest ranked countries for female life expectancy (higher) 66076 non-null object 14 Closest ranked countries for male life expectancy (lower) 66087 non-null object 15 Closest ranked countries for female life expectancy (lower) 66087 non-null object 16 Rank (male) 66087 non-null int64 17 Rank (female) 66087 non-null int64 18 Male life expectancy change 1987 to 2007 (years) 66087 non-null float64 19 Female life expectancy change 1987 to 2007 (years) 66087 non-null float64 20 Male life expectancy change 1987 to 1997 (years) 66087 non-null float64 21 Female life expectancy change 1987 to 1997 (years) 66087 non-null float64 22 Male life expectancy change 1997 to 2007 (years) 66087 non-null float64 23 Female life expectancy change 1997 to 2007 (years) 66087 non-null float64 dtypes: float64(12), int64(4), object(8) memory usage: 12.1+ MB
None
fips 0 State 0 County 0 Year 0 Male life expectancy (years) 0 Years behind international frontier (male) 0 Female life expectancy (years) 0 Years behind international frontier (female) 0 White male life expectancy (years) 0 White female life expectancy (years) 0 Black male life expectancy (years) 50593 Black female life expectancy (years) 47725 Closest ranked countries for male life expectancy (higher) 96 Closest ranked countries for female life expectancy (higher) 11 Closest ranked countries for male life expectancy (lower) 0 Closest ranked countries for female life expectancy (lower) 0 Rank (male) 0 Rank (female) 0 Male life expectancy change 1987 to 2007 (years) 0 Female life expectancy change 1987 to 2007 (years) 0 Male life expectancy change 1987 to 1997 (years) 0 Female life expectancy change 1987 to 1997 (years) 0 Male life expectancy change 1997 to 2007 (years) 0 Female life expectancy change 1997 to 2007 (years) 0 dtype: int64
| fips | State | County | Year | Male life expectancy (years) | Years behind international frontier (male) | Female life expectancy (years) | Years behind international frontier (female) | White male life expectancy (years) | White female life expectancy (years) | ... | Closest ranked countries for male life expectancy (lower) | Closest ranked countries for female life expectancy (lower) | Rank (male) | Rank (female) | Male life expectancy change 1987 to 2007 (years) | Female life expectancy change 1987 to 2007 (years) | Male life expectancy change 1987 to 1997 (years) | Female life expectancy change 1987 to 1997 (years) | Male life expectancy change 1997 to 2007 (years) | Female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | ALABAMA | AUTAUGA | 1987 | 69.2 | 32 | 77.4 | 12 | 70.3 | 78.5 | ... | Albania,Bahrain,Guam,Jamaica,Macedonia, the Fo... | Guadeloupe,Israel,Malta,Montenegro,Portugal | 2684 | 2661 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 1 | 1001 | ALABAMA | AUTAUGA | 1988 | 69.3 | 32 | 77.3 | 14 | 70.5 | 78.4 | ... | Albania,Jamaica,Macedonia, the Former Yugoslav... | Barbados,Cuba,Mayotte,Reunion,Slovenia | 2646 | 2691 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 2 | 1001 | ALABAMA | AUTAUGA | 1989 | 69.8 | 25 | 77.5 | 14 | 71.0 | 78.6 | ... | Albania,Bahrain,Guam,Macedonia, the Former Yug... | Barbados,Mayotte,Reunion,Singapore,Slovenia | 2522 | 2637 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 3 | 1001 | ALABAMA | AUTAUGA | 1990 | 69.7 | 27 | 77.7 | 14 | 71.0 | 78.7 | ... | Albania,French Guiana,Macedonia, the Former Yu... | Barbados,Mayotte,Netherlands Antilles,Reunion,... | 2585 | 2590 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 4 | 1001 | ALABAMA | AUTAUGA | 1991 | 70.0 | 25 | 77.7 | 15 | 71.2 | 78.8 | ... | French Guiana,Macedonia, the Former Yugoslav R... | Barbados,Kuwait,Netherlands Antilles,Singapore... | 2534 | 2616 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66082 | 56045 | WYOMING | WESTON | 2003 | 75.0 | 12 | 80.5 | 14 | 74.9 | 80.5 | ... | Brunei Darussalam,Chile,Finland,Korea, Republi... | Cuba,Denmark,Greece,Mayotte,Reunion | 957 | 749 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66083 | 56045 | WYOMING | WESTON | 2004 | 75.4 | 11 | 80.6 | 15 | 75.4 | 80.5 | ... | Chile,Denmark,Finland,Guadeloupe,Virgin Island... | Cuba,Denmark,Kuwait,Mayotte,Reunion | 779 | 808 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66084 | 56045 | WYOMING | WESTON | 2005 | 75.4 | 12 | 80.7 | 15 | 75.4 | 80.7 | ... | Brunei Darussalam,Chile,Korea, Republic of,Por... | Cuba,Denmark,French Guiana,Mayotte,Reunion | 883 | 742 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66085 | 56045 | WYOMING | WESTON | 2006 | 75.6 | 12 | 81.1 | 14 | 75.6 | 81.1 | ... | Brunei Darussalam,Chile,Korea, Republic of,Por... | Cuba,Denmark,Greece,Mayotte,Reunion | 843 | 563 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66086 | 56045 | WYOMING | WESTON | 2007 | 75.9 | 12 | 81.5 | 13 | 75.9 | 81.5 | ... | Brunei Darussalam,Chile,Korea, Republic of,Por... | Channel Islands,Greece,Malta,United Kingdom,Un... | 727 | 393 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
66087 rows × 24 columns
lifemissings = life[['Black male life expectancy (years)', 'Black female life expectancy (years)', 'Closest ranked countries for male life expectancy (higher)', 'Closest ranked countries for female life expectancy (higher)']]
display(lifemissings)
| Black male life expectancy (years) | Black female life expectancy (years) | Closest ranked countries for male life expectancy (higher) | Closest ranked countries for female life expectancy (higher) | |
|---|---|---|---|---|
| 0 | NaN | 73.4 | Chile,Guadeloupe,Panama,United Arab Emirates,V... | Austria,Costa Rica,Netherlands Antilles,New Ze... |
| 1 | 64.3 | 73.3 | Bahrain,Chile,Guam,Panama,United Arab Emirates | Guadeloupe,Ireland,Israel,Malta,Netherlands An... |
| 2 | 64.7 | 73.5 | Belize,Chile,Portugal,Puerto Rico,United Arab ... | Ireland,Israel,Malta,Netherlands Antilles,Port... |
| 3 | 64.4 | 73.6 | Bahrain,Belize,Chile,Guam,Puerto Rico | Costa Rica,Denmark,Ireland,Malta,Portugal |
| 4 | 64.5 | 73.5 | Bahrain,Belize,Chile,Guam,Portugal | Denmark,Ireland,Malta,Mayotte,Reunion |
| ... | ... | ... | ... | ... |
| 66082 | NaN | NaN | Belgium,Denmark,Guadeloupe,Luxembourg,Virgin I... | Costa Rica,Ireland,Slovenia,United Kingdom,Uni... |
| 66083 | NaN | NaN | Belgium,Cuba,Kuwait,Luxembourg,Martinique | Costa Rica,Greece,Ireland,Slovenia,United States |
| 66084 | NaN | NaN | Denmark,Finland,Guadeloupe,Kuwait,Virgin Islan... | Channel Islands,Costa Rica,Greece,Malta,United... |
| 66085 | NaN | NaN | Denmark,Finland,Guadeloupe,Kuwait,Virgin Islan... | Channel Islands,Costa Rica,Malta,United Kingdo... |
| 66086 | NaN | NaN | Costa Rica,Denmark,Finland,Guadeloupe,Kuwait | Chile,Netherlands,Portugal,Slovenia,Virgin Isl... |
66087 rows × 4 columns
for col in lifemissings.columns:
if col == 'Closest ranked countries for male life expectancy (higher)' or col == 'Closest ranked countries for femmale life expectancy (higher)':
lifemissings[col].replace(np.nan, 'none', inplace = True)
else :
lifemissings[col].replace(np.nan,0,inplace=True)
display(lifemissings.info())
display(lifemissings)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 66087 entries, 0 to 66086 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Black male life expectancy (years) 66087 non-null float64 1 Black female life expectancy (years) 66087 non-null float64 2 Closest ranked countries for male life expectancy (higher) 66087 non-null object 3 Closest ranked countries for female life expectancy (higher) 66087 non-null object dtypes: float64(2), object(2) memory usage: 2.0+ MB
None
| Black male life expectancy (years) | Black female life expectancy (years) | Closest ranked countries for male life expectancy (higher) | Closest ranked countries for female life expectancy (higher) | |
|---|---|---|---|---|
| 0 | 0.0 | 73.4 | Chile,Guadeloupe,Panama,United Arab Emirates,V... | Austria,Costa Rica,Netherlands Antilles,New Ze... |
| 1 | 64.3 | 73.3 | Bahrain,Chile,Guam,Panama,United Arab Emirates | Guadeloupe,Ireland,Israel,Malta,Netherlands An... |
| 2 | 64.7 | 73.5 | Belize,Chile,Portugal,Puerto Rico,United Arab ... | Ireland,Israel,Malta,Netherlands Antilles,Port... |
| 3 | 64.4 | 73.6 | Bahrain,Belize,Chile,Guam,Puerto Rico | Costa Rica,Denmark,Ireland,Malta,Portugal |
| 4 | 64.5 | 73.5 | Bahrain,Belize,Chile,Guam,Portugal | Denmark,Ireland,Malta,Mayotte,Reunion |
| ... | ... | ... | ... | ... |
| 66082 | 0.0 | 0.0 | Belgium,Denmark,Guadeloupe,Luxembourg,Virgin I... | Costa Rica,Ireland,Slovenia,United Kingdom,Uni... |
| 66083 | 0.0 | 0.0 | Belgium,Cuba,Kuwait,Luxembourg,Martinique | Costa Rica,Greece,Ireland,Slovenia,United States |
| 66084 | 0.0 | 0.0 | Denmark,Finland,Guadeloupe,Kuwait,Virgin Islan... | Channel Islands,Costa Rica,Greece,Malta,United... |
| 66085 | 0.0 | 0.0 | Denmark,Finland,Guadeloupe,Kuwait,Virgin Islan... | Channel Islands,Costa Rica,Malta,United Kingdo... |
| 66086 | 0.0 | 0.0 | Costa Rica,Denmark,Finland,Guadeloupe,Kuwait | Chile,Netherlands,Portugal,Slovenia,Virgin Isl... |
66087 rows × 4 columns
lifemissings.isna().sum()
Black male life expectancy (years) 0 Black female life expectancy (years) 0 Closest ranked countries for male life expectancy (higher) 0 Closest ranked countries for female life expectancy (higher) 0 dtype: int64
for col in life.columns:
if col in (lifemissings.columns):
life[col] = lifemissings[col]
display(life.info())
display(life.isna().sum())
display(life.sample(20))
<class 'pandas.core.frame.DataFrame'> RangeIndex: 66087 entries, 0 to 66086 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fips 66087 non-null int64 1 State 66087 non-null object 2 County 66087 non-null object 3 Year 66087 non-null int64 4 Male life expectancy (years) 66087 non-null float64 5 Years behind international frontier (male) 66087 non-null object 6 Female life expectancy (years) 66087 non-null float64 7 Years behind international frontier (female) 66087 non-null object 8 White male life expectancy (years) 66087 non-null float64 9 White female life expectancy (years) 66087 non-null float64 10 Black male life expectancy (years) 66087 non-null float64 11 Black female life expectancy (years) 66087 non-null float64 12 Closest ranked countries for male life expectancy (higher) 66087 non-null object 13 Closest ranked countries for female life expectancy (higher) 66087 non-null object 14 Closest ranked countries for male life expectancy (lower) 66087 non-null object 15 Closest ranked countries for female life expectancy (lower) 66087 non-null object 16 Rank (male) 66087 non-null int64 17 Rank (female) 66087 non-null int64 18 Male life expectancy change 1987 to 2007 (years) 66087 non-null float64 19 Female life expectancy change 1987 to 2007 (years) 66087 non-null float64 20 Male life expectancy change 1987 to 1997 (years) 66087 non-null float64 21 Female life expectancy change 1987 to 1997 (years) 66087 non-null float64 22 Male life expectancy change 1997 to 2007 (years) 66087 non-null float64 23 Female life expectancy change 1997 to 2007 (years) 66087 non-null float64 dtypes: float64(12), int64(4), object(8) memory usage: 12.1+ MB
None
fips 0 State 0 County 0 Year 0 Male life expectancy (years) 0 Years behind international frontier (male) 0 Female life expectancy (years) 0 Years behind international frontier (female) 0 White male life expectancy (years) 0 White female life expectancy (years) 0 Black male life expectancy (years) 0 Black female life expectancy (years) 0 Closest ranked countries for male life expectancy (higher) 0 Closest ranked countries for female life expectancy (higher) 0 Closest ranked countries for male life expectancy (lower) 0 Closest ranked countries for female life expectancy (lower) 0 Rank (male) 0 Rank (female) 0 Male life expectancy change 1987 to 2007 (years) 0 Female life expectancy change 1987 to 2007 (years) 0 Male life expectancy change 1987 to 1997 (years) 0 Female life expectancy change 1987 to 1997 (years) 0 Male life expectancy change 1997 to 2007 (years) 0 Female life expectancy change 1997 to 2007 (years) 0 dtype: int64
| fips | State | County | Year | Male life expectancy (years) | Years behind international frontier (male) | Female life expectancy (years) | Years behind international frontier (female) | White male life expectancy (years) | White female life expectancy (years) | ... | Closest ranked countries for male life expectancy (lower) | Closest ranked countries for female life expectancy (lower) | Rank (male) | Rank (female) | Male life expectancy change 1987 to 2007 (years) | Female life expectancy change 1987 to 2007 (years) | Male life expectancy change 1987 to 1997 (years) | Female life expectancy change 1987 to 1997 (years) | Male life expectancy change 1997 to 2007 (years) | Female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 65515 | 55133 | WISCONSIN | WAUKESHA | 2003 | 77.8 | 1 | 81.7 | 8 | 77.8 | 81.7 | ... | Canada,Israel,Italy,Macao Special Administrati... | Cyprus,Germany,Guadeloupe,Macao Special Admini... | 26 | 94 | 4.7 | 2.5 | 2.8 | 1.7 | 1.9 | 0.8 |
| 17124 | 19051 | IOWA | DAVIS | 1996 | 73.3 | 13 | 80.2 | 9 | 73.3 | 80.2 | ... | Brunei Darussalam,Denmark,Guadeloupe,Ireland,V... | Austria,Cyprus,Germany,Luxembourg,New Zealand | 1196 | 732 | 2.6 | 1.7 | 1.6 | 1.3 | 1.0 | 0.4 |
| 43381 | 39041 | OHIO | DELAWARE | 2003 | 76.7 | 5 | 80.8 | 12 | 76.8 | 80.9 | ... | France,Malta,Netherlands,Spain,United Kingdom | Costa Rica,Ireland,Slovenia,United Kingdom,Uni... | 145 | 513 | 4.8 | 2.3 | 3.4 | 2.1 | 1.4 | 0.2 |
| 5298 | 8014 | COLORADO | BROOMFIELD | 1993 | 75.1 | 1 | 80.6 | 3 | 75.1 | 80.6 | ... | Australia,Canada,Cyprus,Greece,Switzerland | Guadeloupe,Macao Special Administrative Region... | 67 | 194 | 3.8 | 1.7 | 2.0 | 1.0 | 1.8 | 0.7 |
| 29935 | 28047 | MISSISSIPPI | HARRISON | 1997 | 70.5 | 27 | 77.4 | 22 | 71.7 | 78.4 | ... | Korea, Republic of,Mayotte,Qatar,Reunion,Uruguay | Argentina,French Guiana,Panama,Poland,Slovakia | 2657 | 2802 | 1.6 | 1.4 | 1.1 | 0.2 | 0.5 | 1.2 |
| 56071 | 48291 | TEXAS | LIBERTY | 1988 | 68.6 | 36 | 77.2 | 14 | 69.5 | 77.9 | ... | Bulgaria,Czech Republic,French Guiana,Saint Lu... | Barbados,Cuba,Mayotte,Reunion,Slovenia | 2833 | 2729 | 2.5 | 0.0 | 1.4 | -0.2 | 1.1 | 0.2 |
| 2028 | 2290 | ALASKA | YUKON-KOYUKUK | 1999 | 74.3 | 11 | 79.6 | 15 | 73.7 | 79.1 | ... | Denmark,Finland,Guadeloupe,Ireland,Virgin Isla... | Chile,Greece,Mayotte,Reunion,Slovenia | 988 | 1451 | 4.3 | 2.1 | 1.8 | 1.1 | 2.5 | 1.0 |
| 61201 | 51195 | VIRGINIA | WISE | 1994 | 69.5 | 35 | 77.8 | 18 | 69.7 | 77.9 | ... | Croatia,Korea, Republic of,New Caledonia,Saint... | Barbados,Brunei Darussalam,Chile,Kuwait,Nether... | 2766 | 2625 | 2.0 | -0.2 | 1.1 | 0.2 | 0.9 | -0.4 |
| 54735 | 48163 | TEXAS | FRIO | 1996 | 70.3 | 27 | 78.3 | 18 | 70.7 | 78.3 | ... | Croatia,Korea, Republic of,Macedonia, the Form... | Barbados,Brunei Darussalam,Denmark,Korea, Repu... | 2667 | 2427 | 3.2 | 0.7 | 1.0 | 0.4 | 2.2 | 0.3 |
| 49247 | 45055 | SOUTH CAROLINA | KERSHAW | 1989 | 69.6 | 29 | 77.9 | 12 | 71.4 | 79.4 | ... | Albania,Jamaica,Macedonia, the Former Yugoslav... | Costa Rica,Denmark,Israel,Malta,Portugal | 2605 | 2433 | 3.4 | 1.0 | 1.5 | 0.4 | 1.9 | 0.6 |
| 27554 | 26161 | MICHIGAN | WASHTENAW | 1989 | 73.5 | 5 | 79.1 | 8 | 74.0 | 79.5 | ... | Channel Islands,Costa Rica,Italy,Norway,Spain | Cyprus,Finland,Macao Special Administrative Re... | 208 | 1311 | 5.4 | 3.0 | 3.4 | 1.8 | 2.0 | 1.2 |
| 27321 | 26139 | MICHIGAN | OTTAWA | 1987 | 73.7 | 2 | 80.1 | 1 | 73.6 | 80.0 | ... | Canada,Macao Special Administrative Region of ... | Australia,France,Hong Kong Special Administrat... | 65 | 126 | 5.1 | 2.3 | 2.6 | 1.4 | 2.5 | 0.9 |
| 388 | 1037 | ALABAMA | COOSA | 1997 | 70.4 | 27 | 78.1 | 20 | 71.6 | 78.9 | ... | Albania,Macedonia, the Former Yugoslav Republi... | Brunei Darussalam,Cuba,Netherlands Antilles,Un... | 2693 | 2582 | 1.7 | 0.0 | 1.7 | 0.8 | 0.0 | -0.8 |
| 27477 | 26153 | MICHIGAN | SCHOOLCRAFT | 1996 | 73.1 | 14 | 79.2 | 14 | 73.3 | 79.1 | ... | Brunei Darussalam,Chile,Finland,Montenegro,Vir... | Ireland,Mayotte,Montenegro,Reunion,Singapore | 1418 | 1783 | 3.1 | 1.1 | 1.8 | 0.3 | 1.3 | 0.8 |
| 29434 | 27173 | MINNESOTA | YELLOW MEDICINE | 2000 | 74.8 | 10 | 81.4 | 6 | 74.8 | 81.3 | ... | Belgium,Cuba,Denmark,Guadeloupe,Luxembourg | Austria,Finland,Guadeloupe,Macao Special Admin... | 732 | 104 | 3.0 | 1.0 | 2.0 | 1.1 | 1.0 | -0.1 |
| 16518 | 18177 | INDIANA | WAYNE | 1999 | 73.1 | 17 | 78.9 | 19 | 73.4 | 79.1 | ... | Bahrain,Barbados,Belize,Montenegro,Portugal | Barbados,Brunei Darussalam,Cuba,Denmark,Kuwait | 1835 | 2087 | 2.4 | 1.3 | 2.1 | 0.5 | 0.3 | 0.8 |
| 42588 | 38073 | NORTH DAKOTA | RANSOM | 1987 | 72.5 | 8 | 79.9 | 1 | 72.5 | 79.9 | ... | Channel Islands,Kuwait,Martinique,Montenegro,U... | Australia,Hong Kong Special Administrative Reg... | 624 | 190 | 3.4 | 1.4 | 1.2 | 0.6 | 2.2 | 0.8 |
| 61515 | 51610 | VIRGINIA | FALLS CHURCH | 1993 | 74.1 | 6 | 80.4 | 5 | 74.5 | 80.7 | ... | Kuwait,Martinique,Singapore,Spain,United Kingdom | Belgium,Guadeloupe,Macao Special Administrativ... | 302 | 360 | 6.2 | 3.6 | 2.3 | 1.6 | 3.9 | 2.0 |
| 54722 | 48161 | TEXAS | FREESTONE | 2004 | 73.0 | 22 | 78.7 | 24 | 74.1 | 79.5 | ... | Albania,Czech Republic,Guam,Mexico,Panama | Argentina,Croatia,Mexico,Slovakia,United Arab ... | 2141 | 2198 | 2.7 | 0.8 | 0.7 | 0.2 | 2.0 | 0.6 |
| 13512 | 17095 | ILLINOIS | KNOX | 1996 | 73.4 | 12 | 79.7 | 12 | 73.9 | 79.9 | ... | Brunei Darussalam,Denmark,Guadeloupe,Ireland,V... | Channel Islands,Costa Rica,Greece,United Kingd... | 1119 | 1296 | 2.2 | 0.2 | 1.9 | 0.8 | 0.3 | -0.6 |
20 rows × 24 columns
life.columns = life.columns.str.lower()
life = life.applymap(lambda s : s.lower() if type(s)==str else s)
life
| fips | state | county | year | male life expectancy (years) | years behind international frontier (male) | female life expectancy (years) | years behind international frontier (female) | white male life expectancy (years) | white female life expectancy (years) | ... | closest ranked countries for male life expectancy (lower) | closest ranked countries for female life expectancy (lower) | rank (male) | rank (female) | male life expectancy change 1987 to 2007 (years) | female life expectancy change 1987 to 2007 (years) | male life expectancy change 1987 to 1997 (years) | female life expectancy change 1987 to 1997 (years) | male life expectancy change 1997 to 2007 (years) | female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | alabama | autauga | 1987 | 69.2 | 32 | 77.4 | 12 | 70.3 | 78.5 | ... | albania,bahrain,guam,jamaica,macedonia, the fo... | guadeloupe,israel,malta,montenegro,portugal | 2684 | 2661 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 1 | 1001 | alabama | autauga | 1988 | 69.3 | 32 | 77.3 | 14 | 70.5 | 78.4 | ... | albania,jamaica,macedonia, the former yugoslav... | barbados,cuba,mayotte,reunion,slovenia | 2646 | 2691 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 2 | 1001 | alabama | autauga | 1989 | 69.8 | 25 | 77.5 | 14 | 71.0 | 78.6 | ... | albania,bahrain,guam,macedonia, the former yug... | barbados,mayotte,reunion,singapore,slovenia | 2522 | 2637 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 3 | 1001 | alabama | autauga | 1990 | 69.7 | 27 | 77.7 | 14 | 71.0 | 78.7 | ... | albania,french guiana,macedonia, the former yu... | barbados,mayotte,netherlands antilles,reunion,... | 2585 | 2590 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 4 | 1001 | alabama | autauga | 1991 | 70.0 | 25 | 77.7 | 15 | 71.2 | 78.8 | ... | french guiana,macedonia, the former yugoslav r... | barbados,kuwait,netherlands antilles,singapore... | 2534 | 2616 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66082 | 56045 | wyoming | weston | 2003 | 75.0 | 12 | 80.5 | 14 | 74.9 | 80.5 | ... | brunei darussalam,chile,finland,korea, republi... | cuba,denmark,greece,mayotte,reunion | 957 | 749 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66083 | 56045 | wyoming | weston | 2004 | 75.4 | 11 | 80.6 | 15 | 75.4 | 80.5 | ... | chile,denmark,finland,guadeloupe,virgin island... | cuba,denmark,kuwait,mayotte,reunion | 779 | 808 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66084 | 56045 | wyoming | weston | 2005 | 75.4 | 12 | 80.7 | 15 | 75.4 | 80.7 | ... | brunei darussalam,chile,korea, republic of,por... | cuba,denmark,french guiana,mayotte,reunion | 883 | 742 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66085 | 56045 | wyoming | weston | 2006 | 75.6 | 12 | 81.1 | 14 | 75.6 | 81.1 | ... | brunei darussalam,chile,korea, republic of,por... | cuba,denmark,greece,mayotte,reunion | 843 | 563 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66086 | 56045 | wyoming | weston | 2007 | 75.9 | 12 | 81.5 | 13 | 75.9 | 81.5 | ... | brunei darussalam,chile,korea, republic of,por... | channel islands,greece,malta,united kingdom,un... | 727 | 393 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
66087 rows × 24 columns
aggregated = life.groupby(['county', 'year']).mean()
aggregated
| fips | male life expectancy (years) | female life expectancy (years) | white male life expectancy (years) | white female life expectancy (years) | black male life expectancy (years) | black female life expectancy (years) | rank (male) | rank (female) | male life expectancy change 1987 to 2007 (years) | female life expectancy change 1987 to 2007 (years) | male life expectancy change 1987 to 1997 (years) | female life expectancy change 1987 to 1997 (years) | male life expectancy change 1997 to 2007 (years) | female life expectancy change 1997 to 2007 (years) | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| county | year | |||||||||||||||
| abbeville | 1987 | 45001.0 | 68.7 | 76.9 | 70.9 | 78.9 | 64.8 | 73.5 | 2805.0 | 2814.0 | 3.3 | 1.2 | 2.2 | 0.8 | 1.1 | 0.4 |
| 1988 | 45001.0 | 68.7 | 77.0 | 71.0 | 79.0 | 64.8 | 73.6 | 2803.0 | 2792.0 | 3.3 | 1.2 | 2.2 | 0.8 | 1.1 | 0.4 | |
| 1989 | 45001.0 | 68.8 | 77.1 | 71.2 | 79.1 | 64.8 | 73.6 | 2818.0 | 2791.0 | 3.3 | 1.2 | 2.2 | 0.8 | 1.1 | 0.4 | |
| 1990 | 45001.0 | 68.8 | 77.1 | 71.2 | 79.2 | 64.7 | 73.6 | 2836.0 | 2798.0 | 3.3 | 1.2 | 2.2 | 0.8 | 1.1 | 0.4 | |
| 1991 | 45001.0 | 69.3 | 77.1 | 71.8 | 79.2 | 65.2 | 73.7 | 2710.0 | 2802.0 | 3.3 | 1.2 | 2.2 | 0.8 | 1.1 | 0.4 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ziebach | 2003 | 46137.0 | 73.2 | 80.0 | 71.4 | 78.7 | 0.0 | 0.0 | 1983.0 | 1092.0 | 2.0 | 0.6 | 1.3 | 0.3 | 0.7 | 0.3 |
| 2004 | 46137.0 | 73.2 | 79.9 | 71.3 | 78.6 | 0.0 | 0.0 | 2038.0 | 1294.0 | 2.0 | 0.6 | 1.3 | 0.3 | 0.7 | 0.3 | |
| 2005 | 46137.0 | 72.9 | 80.1 | 70.9 | 78.7 | 0.0 | 0.0 | 2217.0 | 1223.0 | 2.0 | 0.6 | 1.3 | 0.3 | 0.7 | 0.3 | |
| 2006 | 46137.0 | 72.6 | 79.8 | 70.6 | 78.4 | 0.0 | 0.0 | 2353.0 | 1580.0 | 2.0 | 0.6 | 1.3 | 0.3 | 0.7 | 0.3 | |
| 2007 | 46137.0 | 72.9 | 79.9 | 70.7 | 78.4 | 0.0 | 0.0 | 2281.0 | 1576.0 | 2.0 | 0.6 | 1.3 | 0.3 | 0.7 | 0.3 |
38703 rows × 15 columns
house = pd.read_csv("data/USElectionResults19762020/1976-2020-house.csv")
display(house.info())
display(house.isna().sum())
display(house)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 31103 entries, 0 to 31102 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 31103 non-null int64 1 state 31103 non-null object 2 state_po 31103 non-null object 3 state_fips 31103 non-null int64 4 state_cen 31103 non-null int64 5 state_ic 31103 non-null int64 6 office 31103 non-null object 7 district 31103 non-null int64 8 stage 31103 non-null object 9 runoff 22447 non-null object 10 special 31103 non-null bool 11 candidate 31103 non-null object 12 party 27483 non-null object 13 writein 31103 non-null bool 14 mode 31103 non-null object 15 candidatevotes 31103 non-null int64 16 totalvotes 31103 non-null int64 17 unofficial 31103 non-null bool 18 version 31103 non-null int64 19 fusion_ticket 31103 non-null bool dtypes: bool(4), int64(8), object(8) memory usage: 3.9+ MB
None
year 0 state 0 state_po 0 state_fips 0 state_cen 0 state_ic 0 office 0 district 0 stage 0 runoff 8656 special 0 candidate 0 party 3620 writein 0 mode 0 candidatevotes 0 totalvotes 0 unofficial 0 version 0 fusion_ticket 0 dtype: int64
| year | state | state_po | state_fips | state_cen | state_ic | office | district | stage | runoff | special | candidate | party | writein | mode | candidatevotes | totalvotes | unofficial | version | fusion_ticket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | ALABAMA | AL | 1 | 63 | 41 | US HOUSE | 1 | GEN | False | False | BILL DAVENPORT | DEMOCRAT | False | TOTAL | 58906 | 157170 | False | 20220331 | False |
| 1 | 1976 | ALABAMA | AL | 1 | 63 | 41 | US HOUSE | 1 | GEN | False | False | JACK EDWARDS | REPUBLICAN | False | TOTAL | 98257 | 157170 | False | 20220331 | False |
| 2 | 1976 | ALABAMA | AL | 1 | 63 | 41 | US HOUSE | 1 | GEN | False | False | WRITEIN | NaN | True | TOTAL | 7 | 157170 | False | 20220331 | False |
| 3 | 1976 | ALABAMA | AL | 1 | 63 | 41 | US HOUSE | 2 | GEN | False | False | J CAROLE KEAHEY | DEMOCRAT | False | TOTAL | 66288 | 156362 | False | 20220331 | False |
| 4 | 1976 | ALABAMA | AL | 1 | 63 | 41 | US HOUSE | 2 | GEN | False | False | WILLIAM L "BILL" DICKINSON | REPUBLICAN | False | TOTAL | 90069 | 156362 | False | 20220331 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 31098 | 2020 | WYOMING | WY | 56 | 83 | 68 | US HOUSE | 0 | GEN | False | False | LYNNETTE GREY BULL | DEMOCRAT | False | TOTAL | 66576 | 278503 | False | 20220331 | False |
| 31099 | 2020 | WYOMING | WY | 56 | 83 | 68 | US HOUSE | 0 | GEN | False | False | OVERVOTES | NaN | False | TOTAL | 1274 | 278503 | False | 20220331 | False |
| 31100 | 2020 | WYOMING | WY | 56 | 83 | 68 | US HOUSE | 0 | GEN | False | False | RICHARD BRUBAKER | LIBERTARIAN | False | TOTAL | 10154 | 278503 | False | 20220331 | False |
| 31101 | 2020 | WYOMING | WY | 56 | 83 | 68 | US HOUSE | 0 | GEN | False | False | UNDERVOTES | NaN | False | TOTAL | 6337 | 278503 | False | 20220331 | False |
| 31102 | 2020 | WYOMING | WY | 56 | 83 | 68 | US HOUSE | 0 | GEN | False | False | WRITEIN | NaN | True | TOTAL | 525 | 278503 | False | 20220331 | False |
31103 rows × 20 columns
house.replace({np.nan:'unknown', '':'unknown'}, regex=True,inplace=True)
house.isna().sum()
year 0 state 0 state_po 0 state_fips 0 state_cen 0 state_ic 0 office 0 district 0 stage 0 runoff 0 special 0 candidate 0 party 0 writein 0 mode 0 candidatevotes 0 totalvotes 0 unofficial 0 version 0 fusion_ticket 0 dtype: int64
house.columns = house.columns.str.lower()
house = house.applymap(lambda s: s.lower() if type(s)==str else s)
display(house.info())
display(house)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 31103 entries, 0 to 31102 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 31103 non-null int64 1 state 31103 non-null object 2 state_po 31103 non-null object 3 state_fips 31103 non-null int64 4 state_cen 31103 non-null int64 5 state_ic 31103 non-null int64 6 office 31103 non-null object 7 district 31103 non-null int64 8 stage 31103 non-null object 9 runoff 31103 non-null object 10 special 31103 non-null bool 11 candidate 31103 non-null object 12 party 31103 non-null object 13 writein 31103 non-null bool 14 mode 31103 non-null object 15 candidatevotes 31103 non-null int64 16 totalvotes 31103 non-null int64 17 unofficial 31103 non-null bool 18 version 31103 non-null int64 19 fusion_ticket 31103 non-null bool dtypes: bool(4), int64(8), object(8) memory usage: 3.9+ MB
None
| year | state | state_po | state_fips | state_cen | state_ic | office | district | stage | runoff | special | candidate | party | writein | mode | candidatevotes | totalvotes | unofficial | version | fusion_ticket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | alabama | al | 1 | 63 | 41 | us house | 1 | gen | False | False | bill davenport | democrat | False | total | 58906 | 157170 | False | 20220331 | False |
| 1 | 1976 | alabama | al | 1 | 63 | 41 | us house | 1 | gen | False | False | jack edwards | republican | False | total | 98257 | 157170 | False | 20220331 | False |
| 2 | 1976 | alabama | al | 1 | 63 | 41 | us house | 1 | gen | False | False | writein | unknown | True | total | 7 | 157170 | False | 20220331 | False |
| 3 | 1976 | alabama | al | 1 | 63 | 41 | us house | 2 | gen | False | False | j carole keahey | democrat | False | total | 66288 | 156362 | False | 20220331 | False |
| 4 | 1976 | alabama | al | 1 | 63 | 41 | us house | 2 | gen | False | False | william l "bill" dickinson | republican | False | total | 90069 | 156362 | False | 20220331 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 31098 | 2020 | wyoming | wy | 56 | 83 | 68 | us house | 0 | gen | False | False | lynnette grey bull | democrat | False | total | 66576 | 278503 | False | 20220331 | False |
| 31099 | 2020 | wyoming | wy | 56 | 83 | 68 | us house | 0 | gen | False | False | overvotes | unknown | False | total | 1274 | 278503 | False | 20220331 | False |
| 31100 | 2020 | wyoming | wy | 56 | 83 | 68 | us house | 0 | gen | False | False | richard brubaker | libertarian | False | total | 10154 | 278503 | False | 20220331 | False |
| 31101 | 2020 | wyoming | wy | 56 | 83 | 68 | us house | 0 | gen | False | False | undervotes | unknown | False | total | 6337 | 278503 | False | 20220331 | False |
| 31102 | 2020 | wyoming | wy | 56 | 83 | 68 | us house | 0 | gen | False | False | writein | unknown | True | total | 525 | 278503 | False | 20220331 | False |
31103 rows × 20 columns
plt.figure(figsize=(20,20))
sns.heatmap(house.corr(), annot=True)
plt.title("House features correlation matrix", fontsize=15, color='red')
plt.xticks(rotation = 90)
plt.show()
plt.figure(figsize=(25,25))
sns.pairplot(house, corner=True)
plt.show()
<string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility.
<Figure size 1800x1800 with 0 Axes>
president = pd.read_csv("data/USElectionResults19762020/1976-2020-president.csv")
display(president.info())
display(president.isna().sum())
display(president)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4287 entries, 0 to 4286 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 4287 non-null int64 1 state 4287 non-null object 2 state_po 4287 non-null object 3 state_fips 4287 non-null int64 4 state_cen 4287 non-null int64 5 state_ic 4287 non-null int64 6 office 4287 non-null object 7 candidate 4000 non-null object 8 party_detailed 3831 non-null object 9 writein 4284 non-null object 10 candidatevotes 4287 non-null int64 11 totalvotes 4287 non-null int64 12 version 4287 non-null int64 13 notes 0 non-null float64 14 party_simplified 4287 non-null object dtypes: float64(1), int64(7), object(7) memory usage: 502.5+ KB
None
year 0 state 0 state_po 0 state_fips 0 state_cen 0 state_ic 0 office 0 candidate 287 party_detailed 456 writein 3 candidatevotes 0 totalvotes 0 version 0 notes 4287 party_simplified 0 dtype: int64
| year | state | state_po | state_fips | state_cen | state_ic | office | candidate | party_detailed | writein | candidatevotes | totalvotes | version | notes | party_simplified | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | ALABAMA | AL | 1 | 63 | 41 | US PRESIDENT | CARTER, JIMMY | DEMOCRAT | False | 659170 | 1182850 | 20210113 | NaN | DEMOCRAT |
| 1 | 1976 | ALABAMA | AL | 1 | 63 | 41 | US PRESIDENT | FORD, GERALD | REPUBLICAN | False | 504070 | 1182850 | 20210113 | NaN | REPUBLICAN |
| 2 | 1976 | ALABAMA | AL | 1 | 63 | 41 | US PRESIDENT | MADDOX, LESTER | AMERICAN INDEPENDENT PARTY | False | 9198 | 1182850 | 20210113 | NaN | OTHER |
| 3 | 1976 | ALABAMA | AL | 1 | 63 | 41 | US PRESIDENT | BUBAR, BENJAMIN ""BEN"" | PROHIBITION | False | 6669 | 1182850 | 20210113 | NaN | OTHER |
| 4 | 1976 | ALABAMA | AL | 1 | 63 | 41 | US PRESIDENT | HALL, GUS | COMMUNIST PARTY USE | False | 1954 | 1182850 | 20210113 | NaN | OTHER |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4282 | 2020 | WYOMING | WY | 56 | 83 | 68 | US PRESIDENT | JORGENSEN, JO | LIBERTARIAN | False | 5768 | 278503 | 20210113 | NaN | LIBERTARIAN |
| 4283 | 2020 | WYOMING | WY | 56 | 83 | 68 | US PRESIDENT | PIERCE, BROCK | INDEPENDENT | False | 2208 | 278503 | 20210113 | NaN | OTHER |
| 4284 | 2020 | WYOMING | WY | 56 | 83 | 68 | US PRESIDENT | NaN | NaN | True | 1739 | 278503 | 20210113 | NaN | OTHER |
| 4285 | 2020 | WYOMING | WY | 56 | 83 | 68 | US PRESIDENT | OVERVOTES | NaN | False | 279 | 278503 | 20210113 | NaN | OTHER |
| 4286 | 2020 | WYOMING | WY | 56 | 83 | 68 | US PRESIDENT | UNDERVOTES | NaN | False | 1459 | 278503 | 20210113 | NaN | OTHER |
4287 rows × 15 columns
president.drop(columns=['notes'], inplace=True) #all nan feature
president.replace({'': 'unknown', np.nan:'unknown'},regex=True, inplace=True)
display(president.isna().sum())
year 0 state 0 state_po 0 state_fips 0 state_cen 0 state_ic 0 office 0 candidate 0 party_detailed 0 writein 0 candidatevotes 0 totalvotes 0 version 0 party_simplified 0 dtype: int64
president.columns = president.columns.str.lower()
president = president.applymap(lambda s: s.lower() if type(s)==str else s)
plt.figure(figsize=(20,20))
sns.heatmap(president.corr(), annot=True)
plt.title("President features correlation matrix", fontsize=15, color='red')
plt.xticks(rotation = 90)
plt.show()
plt.figure(figsize=(25,25))
sns.pairplot(president, corner=True)
plt.show()
<Figure size 1800x1800 with 0 Axes>
senate = pd.read_csv("data/USElectionResults19762020/1976-2020-senate.csv")
display(senate.info())
display(senate.isna().sum())
display(senate)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3629 entries, 0 to 3628 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 3629 non-null int64 1 state 3629 non-null object 2 state_po 3629 non-null object 3 state_fips 3629 non-null int64 4 state_cen 3629 non-null int64 5 state_ic 3629 non-null int64 6 office 3629 non-null object 7 district 3629 non-null object 8 stage 3629 non-null object 9 special 3629 non-null bool 10 candidate 3206 non-null object 11 party_detailed 3030 non-null object 12 writein 3629 non-null bool 13 mode 3629 non-null object 14 candidatevotes 3629 non-null int64 15 totalvotes 3629 non-null int64 16 unofficial 3629 non-null bool 17 version 3629 non-null int64 18 party_simplified 3629 non-null object dtypes: bool(3), int64(7), object(9) memory usage: 464.4+ KB
None
year 0 state 0 state_po 0 state_fips 0 state_cen 0 state_ic 0 office 0 district 0 stage 0 special 0 candidate 423 party_detailed 599 writein 0 mode 0 candidatevotes 0 totalvotes 0 unofficial 0 version 0 party_simplified 0 dtype: int64
| year | state | state_po | state_fips | state_cen | state_ic | office | district | stage | special | candidate | party_detailed | writein | mode | candidatevotes | totalvotes | unofficial | version | party_simplified | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | ARIZONA | AZ | 4 | 86 | 61 | US SENATE | statewide | gen | False | SAM STEIGER | REPUBLICAN | False | total | 321236 | 741210 | False | 20210114 | REPUBLICAN |
| 1 | 1976 | ARIZONA | AZ | 4 | 86 | 61 | US SENATE | statewide | gen | False | WM. MATHEWS FEIGHAN | INDEPENDENT | False | total | 1565 | 741210 | False | 20210114 | OTHER |
| 2 | 1976 | ARIZONA | AZ | 4 | 86 | 61 | US SENATE | statewide | gen | False | DENNIS DECONCINI | DEMOCRAT | False | total | 400334 | 741210 | False | 20210114 | DEMOCRAT |
| 3 | 1976 | ARIZONA | AZ | 4 | 86 | 61 | US SENATE | statewide | gen | False | ALLAN NORWITZ | LIBERTARIAN | False | total | 7310 | 741210 | False | 20210114 | LIBERTARIAN |
| 4 | 1976 | ARIZONA | AZ | 4 | 86 | 61 | US SENATE | statewide | gen | False | BOB FIELD | INDEPENDENT | False | total | 10765 | 741210 | False | 20210114 | OTHER |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3624 | 2020 | WYOMING | WY | 56 | 83 | 68 | US SENATE | statewide | gen | False | UNDER VOTES | NaN | False | total | 6401 | 278503 | False | 20210114 | OTHER |
| 3625 | 2021 | GEORGIA | GA | 13 | 58 | 44 | US SENATE | statewide | runoff | True | KELLY LOEFFLER | REPUBLICAN | False | total | 2194848 | 4483294 | True | 20210114 | REPUBLICAN |
| 3626 | 2021 | GEORGIA | GA | 13 | 58 | 44 | US SENATE | statewide | runoff | True | RAPHAEL WARNOCK | DEMOCRAT | False | total | 2288446 | 4483294 | True | 20210114 | DEMOCRAT |
| 3627 | 2021 | GEORGIA | GA | 13 | 58 | 44 | US SENATE | statewide | runoff | False | DAVID A. PERDUE | REPUBLICAN | False | total | 2213979 | 4483241 | True | 20210114 | REPUBLICAN |
| 3628 | 2021 | GEORGIA | GA | 13 | 58 | 44 | US SENATE | statewide | runoff | False | JON OSSOFF | DEMOCRAT | False | total | 2269262 | 4483241 | True | 20210114 | DEMOCRAT |
3629 rows × 19 columns
senate.replace({np.nan:'unknown', '':'unknown'},regex=True, inplace=True)
senate.isna().sum()
year 0 state 0 state_po 0 state_fips 0 state_cen 0 state_ic 0 office 0 district 0 stage 0 special 0 candidate 0 party_detailed 0 writein 0 mode 0 candidatevotes 0 totalvotes 0 unofficial 0 version 0 party_simplified 0 dtype: int64
plt.figure(figsize=(20,20))
sns.heatmap(senate.corr(), annot=True)
plt.title("Senate features correlation matrix", fontsize=15, color='red')
plt.xticks(rotation = 90)
plt.show()
plt.figure(figsize=(25,25))
sns.pairplot(senate, corner=True)
plt.show()
<string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility. <string>:6: RuntimeWarning: Converting input from bool to <class 'numpy.uint8'> for compatibility.
<Figure size 1800x1800 with 0 Axes>
1 - We explored the data and imported it in pandas dataframes, and check the info to see the data types, the isna.sum to check for missing values or nulls in each column and displayed data in table.
2- Missing values was imputed either by simply replace missing values with 0 if numeric, or with 'none' or 'unknown' if string. Or we used KNNImputer to fill the missing gaps.
3- In order to fit KNN we used copies of the dataframe and factorized into numeric columns imputed the missing values and replaced the affected columns in place of the original ones.
4- Correlation matrix used to figure out highly correlated features to be dropped, we found in two dataframes that we had correlation of 1 or >0.95, there is no added benefit so in these cases we kept only one column of each group.
5- Visualized pairplot to see a more clear picture of distrubtion, and used scatterplot, countplot in addition to heatmap and tables.
6- Used groupby and displayed the mean of the numeric columns.
Now, all the data is preprocessed and ready for the next sections.
# merge the three dataframes we were asked to use
sectionB = pd.merge(countyData, pd.merge(complete,life))
display(sectionB.info())
display(sectionB)
<class 'pandas.core.frame.DataFrame'> Int64Index: 61110 entries, 0 to 61109 Columns: 214 entries, fips to female life expectancy change 1997 to 2007 (years) dtypes: float64(198), int64(6), object(10) memory usage: 100.2+ MB
None
| fips | state | county | land area | water area | date | population | name | smoking_ban_2010 | pop2000 | ... | closest ranked countries for male life expectancy (lower) | closest ranked countries for female life expectancy (lower) | rank (male) | rank (female) | male life expectancy change 1987 to 2007 (years) | female life expectancy change 1987 to 2007 (years) | male life expectancy change 1987 to 1997 (years) | female life expectancy change 1987 to 1997 (years) | male life expectancy change 1997 to 2007 (years) | female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | alabama | autauga | 1543.7 | 22.0 | 1990 | 34222 | autauga county | none | 43671.0 | ... | albania,bahrain,guam,jamaica,macedonia, the fo... | guadeloupe,israel,malta,montenegro,portugal | 2684 | 2661 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 1 | 1001 | alabama | autauga | 1543.7 | 22.0 | 1990 | 34222 | autauga county | none | 43671.0 | ... | albania,jamaica,macedonia, the former yugoslav... | barbados,cuba,mayotte,reunion,slovenia | 2646 | 2691 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 2 | 1001 | alabama | autauga | 1543.7 | 22.0 | 1990 | 34222 | autauga county | none | 43671.0 | ... | albania,bahrain,guam,macedonia, the former yug... | barbados,mayotte,reunion,singapore,slovenia | 2522 | 2637 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 3 | 1001 | alabama | autauga | 1543.7 | 22.0 | 1990 | 34222 | autauga county | none | 43671.0 | ... | albania,french guiana,macedonia, the former yu... | barbados,mayotte,netherlands antilles,reunion,... | 2585 | 2590 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 4 | 1001 | alabama | autauga | 1543.7 | 22.0 | 1990 | 34222 | autauga county | none | 43671.0 | ... | french guiana,macedonia, the former yugoslav r... | barbados,kuwait,netherlands antilles,singapore... | 2534 | 2616 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 61105 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 | weston county | none | 6644.0 | ... | brunei darussalam,chile,finland,korea, republi... | cuba,denmark,greece,mayotte,reunion | 957 | 749 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 61106 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 | weston county | none | 6644.0 | ... | chile,denmark,finland,guadeloupe,virgin island... | cuba,denmark,kuwait,mayotte,reunion | 779 | 808 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 61107 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 | weston county | none | 6644.0 | ... | brunei darussalam,chile,korea, republic of,por... | cuba,denmark,french guiana,mayotte,reunion | 883 | 742 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 61108 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 | weston county | none | 6644.0 | ... | brunei darussalam,chile,korea, republic of,por... | cuba,denmark,greece,mayotte,reunion | 843 | 563 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 61109 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 | weston county | none | 6644.0 | ... | brunei darussalam,chile,korea, republic of,por... | channel islands,greece,malta,united kingdom,un... | 727 | 393 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
61110 rows × 214 columns
# We can use only : california, florida, south dakota and wyoming, so we will filter state to include only this states
sectionB = sectionB[sectionB['state'].isin(['california', 'florida', 'south dakota', 'wyoming'])]
display(sectionB.info())
<class 'pandas.core.frame.DataFrame'> Int64Index: 4263 entries, 3276 to 61109 Columns: 214 entries, fips to female life expectancy change 1997 to 2007 (years) dtypes: float64(198), int64(6), object(10) memory usage: 7.0+ MB
None
Create a copy dataframe with all string columns factorized into numeric values (needed in order to scale the data and to fit dimensionality reduction algorithms)
sectionBI = sectionB.reset_index(drop=True)
sectionBI
| fips | state | county | land area | water area | date | population | name | smoking_ban_2010 | pop2000 | ... | closest ranked countries for male life expectancy (lower) | closest ranked countries for female life expectancy (lower) | rank (male) | rank (female) | male life expectancy change 1987 to 2007 (years) | female life expectancy change 1987 to 2007 (years) | male life expectancy change 1987 to 1997 (years) | female life expectancy change 1987 to 1997 (years) | male life expectancy change 1997 to 2007 (years) | female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6001 | california | alameda | 1910.1 | 216.0 | 1990 | 1279182 | alameda county | none | 1443741.0 | ... | barbados,germany,ireland,netherlands antilles,... | austria,denmark,germany,united kingdom,virgin ... | 1444 | 2262 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 1 | 6001 | california | alameda | 1910.1 | 216.0 | 1990 | 1279182 | alameda county | none | 1443741.0 | ... | aruba,austria,netherlands antilles,singapore,u... | costa rica,denmark,montenegro,new zealand,virg... | 1538 | 2310 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 2 | 6001 | california | alameda | 1910.1 | 216.0 | 1990 | 1279182 | alameda county | none | 1443741.0 | ... | aruba,finland,guadeloupe,luxembourg,virgin isl... | costa rica,denmark,guadeloupe,montenegro,new z... | 1709 | 2212 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 3 | 6001 | california | alameda | 1910.1 | 216.0 | 1990 | 1279182 | alameda county | none | 1443741.0 | ... | finland,guadeloupe,luxembourg,netherlands anti... | costa rica,israel,montenegro,new zealand,portugal | 1547 | 2186 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 4 | 6001 | california | alameda | 1910.1 | 216.0 | 1990 | 1279182 | alameda county | none | 1443741.0 | ... | barbados,guadeloupe,luxembourg,netherlands ant... | costa rica,israel,montenegro,new zealand,unite... | 1515 | 2002 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4258 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 | weston county | none | 6644.0 | ... | brunei darussalam,chile,finland,korea, republi... | cuba,denmark,greece,mayotte,reunion | 957 | 749 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4259 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 | weston county | none | 6644.0 | ... | chile,denmark,finland,guadeloupe,virgin island... | cuba,denmark,kuwait,mayotte,reunion | 779 | 808 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4260 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 | weston county | none | 6644.0 | ... | brunei darussalam,chile,korea, republic of,por... | cuba,denmark,french guiana,mayotte,reunion | 883 | 742 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4261 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 | weston county | none | 6644.0 | ... | brunei darussalam,chile,korea, republic of,por... | cuba,denmark,greece,mayotte,reunion | 843 | 563 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4262 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 | weston county | none | 6644.0 | ... | brunei darussalam,chile,korea, republic of,por... | channel islands,greece,malta,united kingdom,un... | 727 | 393 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
4263 rows × 214 columns
sectionBI['state'].unique()
array(['california', 'florida', 'south dakota', 'wyoming'], dtype=object)
nsectionBI = sectionBI.copy()
for col in nsectionBI.columns:
if nsectionBI[col].dtype == 'object':
nsectionBI[col] = pd.factorize(nsectionBI[col])[0]
nsectionBI
| fips | state | county | land area | water area | date | population | name | smoking_ban_2010 | pop2000 | ... | closest ranked countries for male life expectancy (lower) | closest ranked countries for female life expectancy (lower) | rank (male) | rank (female) | male life expectancy change 1987 to 2007 (years) | female life expectancy change 1987 to 2007 (years) | male life expectancy change 1987 to 1997 (years) | female life expectancy change 1987 to 1997 (years) | male life expectancy change 1997 to 2007 (years) | female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6001 | 0 | 0 | 1910.1 | 216.0 | 1990 | 1279182 | 0 | 0 | 1443741.0 | ... | 0 | 0 | 1444 | 2262 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 1 | 6001 | 0 | 0 | 1910.1 | 216.0 | 1990 | 1279182 | 0 | 0 | 1443741.0 | ... | 1 | 1 | 1538 | 2310 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 2 | 6001 | 0 | 0 | 1910.1 | 216.0 | 1990 | 1279182 | 0 | 0 | 1443741.0 | ... | 2 | 2 | 1709 | 2212 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 3 | 6001 | 0 | 0 | 1910.1 | 216.0 | 1990 | 1279182 | 0 | 0 | 1443741.0 | ... | 3 | 3 | 1547 | 2186 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 4 | 6001 | 0 | 0 | 1910.1 | 216.0 | 1990 | 1279182 | 0 | 0 | 1443741.0 | ... | 4 | 4 | 1515 | 2002 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4258 | 56045 | 3 | 193 | 6210.6 | 5.7 | 1990 | 6518 | 193 | 0 | 6644.0 | ... | 293 | 42 | 957 | 749 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4259 | 56045 | 3 | 193 | 6210.6 | 5.7 | 1990 | 6518 | 193 | 0 | 6644.0 | ... | 324 | 30 | 779 | 808 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4260 | 56045 | 3 | 193 | 6210.6 | 5.7 | 1990 | 6518 | 193 | 0 | 6644.0 | ... | 204 | 31 | 883 | 742 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4261 | 56045 | 3 | 193 | 6210.6 | 5.7 | 1990 | 6518 | 193 | 0 | 6644.0 | ... | 204 | 42 | 843 | 563 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4262 | 56045 | 3 | 193 | 6210.6 | 5.7 | 1990 | 6518 | 193 | 0 | 6644.0 | ... | 566 | 69 | 727 | 393 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
4263 rows × 214 columns
nsectionBI = pd.DataFrame(StandardScaler().fit_transform(nsectionBI), columns=nsectionBI.columns)
nsectionBI
| fips | state | county | land area | water area | date | population | name | smoking_ban_2010 | pop2000 | ... | closest ranked countries for male life expectancy (lower) | closest ranked countries for female life expectancy (lower) | rank (male) | rank (female) | male life expectancy change 1987 to 2007 (years) | female life expectancy change 1987 to 2007 (years) | male life expectancy change 1987 to 1997 (years) | female life expectancy change 1987 to 1997 (years) | male life expectancy change 1997 to 2007 (years) | female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.994517 | -1.292797 | -1.695422 | -0.542879 | 0.031266 | 0.0 | 1.557546 | -1.695422 | -0.932869 | 1.591993 | ... | -1.283402 | -1.120590 | 0.272777 | 1.270062 | 2.108264 | 2.168689 | 1.028268 | 1.886193 | 2.508414 | 1.834447 |
| 1 | -0.994517 | -1.292797 | -1.695422 | -0.542879 | 0.031266 | 0.0 | 1.557546 | -1.695422 | -0.932869 | 1.591993 | ... | -1.277623 | -1.113691 | 0.383143 | 1.323672 | 2.108264 | 2.168689 | 1.028268 | 1.886193 | 2.508414 | 1.834447 |
| 2 | -0.994517 | -1.292797 | -1.695422 | -0.542879 | 0.031266 | 0.0 | 1.557546 | -1.695422 | -0.932869 | 1.591993 | ... | -1.271844 | -1.106792 | 0.583915 | 1.214217 | 2.108264 | 2.168689 | 1.028268 | 1.886193 | 2.508414 | 1.834447 |
| 3 | -0.994517 | -1.292797 | -1.695422 | -0.542879 | 0.031266 | 0.0 | 1.557546 | -1.695422 | -0.932869 | 1.591993 | ... | -1.266065 | -1.099893 | 0.393710 | 1.185178 | 2.108264 | 2.168689 | 1.028268 | 1.886193 | 2.508414 | 1.834447 |
| 4 | -0.994517 | -1.292797 | -1.695422 | -0.542879 | 0.031266 | 0.0 | 1.557546 | -1.695422 | -0.932869 | 1.591993 | ... | -1.260286 | -1.092995 | 0.356139 | 0.979670 | 2.108264 | 2.168689 | 1.028268 | 1.886193 | 2.508414 | 1.834447 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4258 | 1.536376 | 1.782646 | 1.743077 | 0.390691 | -0.354885 | 0.0 | -0.266271 | 1.743077 | -0.932869 | -0.284350 | ... | 0.409871 | -0.830844 | -0.299011 | -0.419794 | 0.000000 | -0.034734 | 0.222846 | -1.361937 | -0.227058 | 0.838411 |
| 4259 | 1.536376 | 1.782646 | 1.743077 | 0.390691 | -0.354885 | 0.0 | -0.266271 | 1.743077 | -0.932869 | -0.284350 | ... | 0.589022 | -0.913628 | -0.508001 | -0.353897 | 0.000000 | -0.034734 | 0.222846 | -1.361937 | -0.227058 | 0.838411 |
| 4260 | 1.536376 | 1.782646 | 1.743077 | 0.390691 | -0.354885 | 0.0 | -0.266271 | 1.743077 | -0.932869 | -0.284350 | ... | -0.104468 | -0.906730 | -0.385895 | -0.427612 | 0.000000 | -0.034734 | 0.222846 | -1.361937 | -0.227058 | 0.838411 |
| 4261 | 1.536376 | 1.782646 | 1.743077 | 0.390691 | -0.354885 | 0.0 | -0.266271 | 1.743077 | -0.932869 | -0.284350 | ... | -0.104468 | -0.830844 | -0.432859 | -0.627536 | 0.000000 | -0.034734 | 0.222846 | -1.361937 | -0.227058 | 0.838411 |
| 4262 | 1.536376 | 1.782646 | 1.743077 | 0.390691 | -0.354885 | 0.0 | -0.266271 | 1.743077 | -0.932869 | -0.284350 | ... | 1.987561 | -0.644579 | -0.569055 | -0.817407 | 0.000000 | -0.034734 | 0.222846 | -1.361937 | -0.227058 | 0.838411 |
4263 rows × 214 columns
pca = PCA(n_components=20).fit(nsectionBI)
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals= 1)
labels = ['PC' + str(x) for x in range(1,len(per_var)+1)]
npcaDF = pd.DataFrame(pca.transform(nsectionBI), columns=labels)
npcaDF
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | PC12 | PC13 | PC14 | PC15 | PC16 | PC17 | PC18 | PC19 | PC20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20.078847 | -3.148031 | -7.355897 | 3.444966 | -6.396216 | -1.810544 | 0.934945 | 7.104521 | 0.704144 | 1.961503 | -4.614112 | -0.593923 | 1.856628 | 1.635280 | 0.697579 | 0.692138 | -0.352161 | -0.810567 | 0.596144 | 1.046048 |
| 1 | 20.074478 | -3.124059 | -7.352466 | 3.415184 | -6.431199 | -1.832794 | 0.929761 | 7.091101 | 0.701512 | 1.956618 | -4.580999 | -0.523129 | 1.788868 | 1.603075 | 0.658649 | 0.659437 | -0.393194 | -0.824488 | 0.557209 | 1.041088 |
| 2 | 20.072625 | -3.128731 | -7.343302 | 3.419709 | -6.435340 | -1.811738 | 0.984852 | 7.078249 | 0.701469 | 1.916766 | -4.534777 | -0.460299 | 1.735658 | 1.560922 | 0.598274 | 0.617784 | -0.408606 | -0.847924 | 0.494980 | 1.062441 |
| 3 | 20.089584 | -3.217743 | -7.374150 | 3.470125 | -6.394723 | -1.738035 | 1.112069 | 7.021074 | 0.719006 | 1.844027 | -4.389529 | -0.219927 | 1.653719 | 1.469248 | 0.446238 | 0.547608 | -0.418714 | -0.872242 | 0.425404 | 1.061322 |
| 4 | 20.103518 | -3.306641 | -7.390837 | 3.543311 | -6.314850 | -1.647529 | 1.230007 | 6.987476 | 0.741491 | 1.760574 | -4.299363 | -0.055473 | 1.647802 | 1.402108 | 0.308695 | 0.496037 | -0.384707 | -0.886394 | 0.351279 | 1.077347 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4258 | -4.805125 | -3.782610 | 3.450969 | 0.572084 | 1.286960 | -0.693996 | -0.752482 | 0.088054 | -2.284431 | 0.917229 | -0.488780 | 2.353817 | -0.026286 | 0.669192 | -1.144722 | -1.567654 | 1.161919 | 0.419524 | 0.434167 | -0.147288 |
| 4259 | -4.818992 | -3.815261 | 3.483526 | 0.621961 | 1.215029 | -0.746443 | -0.694070 | 0.096296 | -2.408408 | 1.134688 | -0.325914 | 2.369398 | -0.125097 | 0.666340 | -1.406332 | -1.959919 | 2.449471 | 0.366497 | -0.412810 | 0.220758 |
| 4260 | -4.755571 | -3.933844 | 3.348024 | 0.732584 | 1.637802 | -0.449386 | -0.769171 | 0.156947 | -2.161975 | 0.839664 | -0.552770 | 2.368450 | 0.629596 | 0.629467 | -1.824425 | -1.554066 | 0.974357 | 0.132866 | -0.131789 | -0.214248 |
| 4261 | -4.739401 | -4.045749 | 3.329472 | 0.813473 | 1.720586 | -0.339760 | -0.602255 | 0.097849 | -2.134566 | 0.721955 | -0.426531 | 2.595002 | 0.581795 | 0.538655 | -1.954174 | -1.606795 | 0.977439 | 0.130973 | -0.164600 | -0.202732 |
| 4262 | -4.820685 | -4.018260 | 3.511165 | 0.715126 | 1.138003 | -0.639271 | -0.186445 | -0.159633 | -2.454920 | 0.907928 | 0.191974 | 3.150977 | -0.891470 | 0.355781 | -1.493693 | -2.320315 | 3.009618 | 0.512628 | -0.496973 | 0.465930 |
4263 rows × 20 columns
npcaDF = pd.concat([sectionBI[['state']], npcaDF], axis = 1)
npcaDF
| state | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | ... | PC11 | PC12 | PC13 | PC14 | PC15 | PC16 | PC17 | PC18 | PC19 | PC20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | california | 20.078847 | -3.148031 | -7.355897 | 3.444966 | -6.396216 | -1.810544 | 0.934945 | 7.104521 | 0.704144 | ... | -4.614112 | -0.593923 | 1.856628 | 1.635280 | 0.697579 | 0.692138 | -0.352161 | -0.810567 | 0.596144 | 1.046048 |
| 1 | california | 20.074478 | -3.124059 | -7.352466 | 3.415184 | -6.431199 | -1.832794 | 0.929761 | 7.091101 | 0.701512 | ... | -4.580999 | -0.523129 | 1.788868 | 1.603075 | 0.658649 | 0.659437 | -0.393194 | -0.824488 | 0.557209 | 1.041088 |
| 2 | california | 20.072625 | -3.128731 | -7.343302 | 3.419709 | -6.435340 | -1.811738 | 0.984852 | 7.078249 | 0.701469 | ... | -4.534777 | -0.460299 | 1.735658 | 1.560922 | 0.598274 | 0.617784 | -0.408606 | -0.847924 | 0.494980 | 1.062441 |
| 3 | california | 20.089584 | -3.217743 | -7.374150 | 3.470125 | -6.394723 | -1.738035 | 1.112069 | 7.021074 | 0.719006 | ... | -4.389529 | -0.219927 | 1.653719 | 1.469248 | 0.446238 | 0.547608 | -0.418714 | -0.872242 | 0.425404 | 1.061322 |
| 4 | california | 20.103518 | -3.306641 | -7.390837 | 3.543311 | -6.314850 | -1.647529 | 1.230007 | 6.987476 | 0.741491 | ... | -4.299363 | -0.055473 | 1.647802 | 1.402108 | 0.308695 | 0.496037 | -0.384707 | -0.886394 | 0.351279 | 1.077347 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4258 | wyoming | -4.805125 | -3.782610 | 3.450969 | 0.572084 | 1.286960 | -0.693996 | -0.752482 | 0.088054 | -2.284431 | ... | -0.488780 | 2.353817 | -0.026286 | 0.669192 | -1.144722 | -1.567654 | 1.161919 | 0.419524 | 0.434167 | -0.147288 |
| 4259 | wyoming | -4.818992 | -3.815261 | 3.483526 | 0.621961 | 1.215029 | -0.746443 | -0.694070 | 0.096296 | -2.408408 | ... | -0.325914 | 2.369398 | -0.125097 | 0.666340 | -1.406332 | -1.959919 | 2.449471 | 0.366497 | -0.412810 | 0.220758 |
| 4260 | wyoming | -4.755571 | -3.933844 | 3.348024 | 0.732584 | 1.637802 | -0.449386 | -0.769171 | 0.156947 | -2.161975 | ... | -0.552770 | 2.368450 | 0.629596 | 0.629467 | -1.824425 | -1.554066 | 0.974357 | 0.132866 | -0.131789 | -0.214248 |
| 4261 | wyoming | -4.739401 | -4.045749 | 3.329472 | 0.813473 | 1.720586 | -0.339760 | -0.602255 | 0.097849 | -2.134566 | ... | -0.426531 | 2.595002 | 0.581795 | 0.538655 | -1.954174 | -1.606795 | 0.977439 | 0.130973 | -0.164600 | -0.202732 |
| 4262 | wyoming | -4.820685 | -4.018260 | 3.511165 | 0.715126 | 1.138003 | -0.639271 | -0.186445 | -0.159633 | -2.454920 | ... | 0.191974 | 3.150977 | -0.891470 | 0.355781 | -1.493693 | -2.320315 | 3.009618 | 0.512628 | -0.496973 | 0.465930 |
4263 rows × 21 columns
PCA scatter plot graph by state:
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
targets = ['california', 'florida', 'south dakota', 'wyoming']
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
indicesToKeep = npcaDF['state'] == target
ax.scatter(npcaDF.loc[indicesToKeep, 'PC1']
, npcaDF.loc[indicesToKeep, 'PC2']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
Top features used in PCA :
loading_scores = pd.Series(pca.components_[0], index= sectionBI.columns)
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
top10Features = sorted_loading_scores[0:10].index.values
print(loading_scores[top10Features])
households_2019 0.112029 pop_2019 0.112009 pop2017 0.111987 households_2017 0.111973 pop2016 0.111936 employed_2015 0.111926 civilian_labor_force_2016 0.111910 civilian_labor_force_2017 0.111897 civilian_labor_force_2015 0.111888 pop2015 0.111882 dtype: float64
Least important features in PCA:
bottom10Features = sorted_loading_scores[-10:].index.values
print(loading_scores[bottom10Features])
poverty_2017 -5.193086e-03 years behind international frontier (male) -4.596984e-03 percent_change_private_nonfarm_employment_2009 -4.361116e-03 age_under_5_2017 4.272801e-03 hispanic_owned_firms_2007 -2.907996e-03 poverty_age_under_18_2017 -1.610122e-03 age_under_5_2019 6.553234e-04 year 5.613648e-04 age_over_18_2019 3.424495e-04 date 2.019484e-28 dtype: float64
As we had 214 columns, the top pcs covers only ~23% which means that is not good enough. So, we are going to analyze by PCA for each dataset alone figure out top features and create a new dataframe by merging only top features.
County_Complete:
complete
| state | name | smoking_ban_2010 | fips | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | alabama | autauga county | none | 1001.0 | 43671.0 | 54571.0 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | ... | 23.200000 | 2.2 | 3.5 | 7.1 | 0.0 | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 |
| 1 | alabama | baldwin county | none | 1003.0 | 140415.0 | 182265.0 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | ... | 13.400000 | 1.7 | 4.0 | 8.9 | 0.3 | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 |
| 2 | alabama | barbour county | partial | 1005.0 | 29038.0 | 27457.0 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | ... | 50.100000 | 1.2 | 9.4 | 11.3 | 0.3 | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 |
| 3 | alabama | bibb county | none | 1007.0 | 20826.0 | 22915.0 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | ... | 34.113333 | 0.6 | 7.0 | 10.7 | 0.0 | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 |
| 4 | alabama | blount county | none | 1009.0 | 51024.0 | 57322.0 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | ... | 18.400000 | 1.6 | 3.1 | 10.8 | 0.2 | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3137 | wyoming | sweetwater county | none | 56037.0 | 37613.0 | 43806.0 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | ... | 20.110000 | 2.3 | 5.7 | 11.3 | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 |
| 3138 | wyoming | teton county | partial | 56039.0 | 18251.0 | 21294.0 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | ... | 20.320000 | 0.7 | 0.7 | 12.7 | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 |
| 3139 | wyoming | uinta county | none | 56041.0 | 19742.0 | 21118.0 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | ... | 25.250000 | 3.5 | 5.5 | 11.2 | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 |
| 3140 | wyoming | washakie county | none | 56043.0 | 8289.0 | 8533.0 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | ... | 33.856667 | 3.8 | 4.1 | 15.0 | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 |
| 3141 | wyoming | weston county | none | 56045.0 | 6644.0 | 7208.0 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | ... | 34.710000 | 1.3 | 4.0 | 11.8 | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 |
3142 rows × 188 columns
fcomplete = complete.copy()
fcomplete = fcomplete[fcomplete['state'].isin(['california', 'florida', 'south dakota', 'wyoming'])]
fcomplete
| state | name | smoking_ban_2010 | fips | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 186 | california | alameda county | none | 6001.0 | 1443741.0 | 1510271.0 | 1532215.0 | 1556648.0 | 1582936.0 | 1611572.0 | ... | 11.300000 | 6.5 | 4.4 | 4.4 | 1.0 | 2.1 | 1.9 | 3.7 | 40.5 | 31.4 |
| 187 | california | alpine county | none | 6003.0 | 1208.0 | 1175.0 | 1093.0 | 1110.0 | 1127.0 | 1084.0 | ... | 25.233333 | 5.1 | 15.6 | 8.9 | 0.0 | 3.7 | 0.0 | 9.4 | 57.7 | 53.0 |
| 188 | california | amador county | none | 6005.0 | 35100.0 | 38091.0 | 37539.0 | 37112.0 | 36635.0 | 36748.0 | ... | 16.486667 | 4.8 | 6.6 | 4.7 | 0.0 | 3.4 | 5.1 | 11.7 | 86.7 | 78.2 |
| 189 | california | butte county | none | 6007.0 | 203171.0 | 220000.0 | 220003.0 | 220969.0 | 221768.0 | 223629.0 | ... | 19.600000 | 6.1 | 7.7 | 6.2 | 0.3 | 3.2 | 3.3 | 7.9 | 81.6 | 72.0 |
| 190 | california | calaveras county | none | 6009.0 | 40554.0 | 45578.0 | 45163.0 | 44826.0 | 44667.0 | 44677.0 | ... | 23.593333 | 5.2 | 6.6 | 4.4 | 0.1 | 0.7 | 0.0 | 11.9 | 90.5 | 80.9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3137 | wyoming | sweetwater county | none | 56037.0 | 37613.0 | 43806.0 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | ... | 20.110000 | 2.3 | 5.7 | 11.3 | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 |
| 3138 | wyoming | teton county | partial | 56039.0 | 18251.0 | 21294.0 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | ... | 20.320000 | 0.7 | 0.7 | 12.7 | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 |
| 3139 | wyoming | uinta county | none | 56041.0 | 19742.0 | 21118.0 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | ... | 25.250000 | 3.5 | 5.5 | 11.2 | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 |
| 3140 | wyoming | washakie county | none | 56043.0 | 8289.0 | 8533.0 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | ... | 33.856667 | 3.8 | 4.1 | 15.0 | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 |
| 3141 | wyoming | weston county | none | 56045.0 | 6644.0 | 7208.0 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | ... | 34.710000 | 1.3 | 4.0 | 11.8 | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 |
214 rows × 188 columns
fcomplete.reset_index(drop=True,inplace=True)
fcomplete
| state | name | smoking_ban_2010 | fips | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | california | alameda county | none | 6001.0 | 1443741.0 | 1510271.0 | 1532215.0 | 1556648.0 | 1582936.0 | 1611572.0 | ... | 11.300000 | 6.5 | 4.4 | 4.4 | 1.0 | 2.1 | 1.9 | 3.7 | 40.5 | 31.4 |
| 1 | california | alpine county | none | 6003.0 | 1208.0 | 1175.0 | 1093.0 | 1110.0 | 1127.0 | 1084.0 | ... | 25.233333 | 5.1 | 15.6 | 8.9 | 0.0 | 3.7 | 0.0 | 9.4 | 57.7 | 53.0 |
| 2 | california | amador county | none | 6005.0 | 35100.0 | 38091.0 | 37539.0 | 37112.0 | 36635.0 | 36748.0 | ... | 16.486667 | 4.8 | 6.6 | 4.7 | 0.0 | 3.4 | 5.1 | 11.7 | 86.7 | 78.2 |
| 3 | california | butte county | none | 6007.0 | 203171.0 | 220000.0 | 220003.0 | 220969.0 | 221768.0 | 223629.0 | ... | 19.600000 | 6.1 | 7.7 | 6.2 | 0.3 | 3.2 | 3.3 | 7.9 | 81.6 | 72.0 |
| 4 | california | calaveras county | none | 6009.0 | 40554.0 | 45578.0 | 45163.0 | 44826.0 | 44667.0 | 44677.0 | ... | 23.593333 | 5.2 | 6.6 | 4.4 | 0.1 | 0.7 | 0.0 | 11.9 | 90.5 | 80.9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 209 | wyoming | sweetwater county | none | 56037.0 | 37613.0 | 43806.0 | 44013.0 | 45042.0 | 45145.0 | 44981.0 | ... | 20.110000 | 2.3 | 5.7 | 11.3 | 0.5 | 9.2 | 13.8 | 8.6 | 93.4 | 79.6 |
| 210 | wyoming | teton county | partial | 56039.0 | 18251.0 | 21294.0 | 21476.0 | 21709.0 | 22326.0 | 22817.0 | ... | 20.320000 | 0.7 | 0.7 | 12.7 | 0.0 | 10.1 | 5.9 | 5.3 | 89.3 | 81.3 |
| 211 | wyoming | uinta county | none | 56041.0 | 19742.0 | 21118.0 | 20899.0 | 20999.0 | 20960.0 | 20845.0 | ... | 25.250000 | 3.5 | 5.5 | 11.2 | 0.6 | 6.8 | 1.8 | 7.4 | 93.4 | 87.5 |
| 212 | wyoming | washakie county | none | 56043.0 | 8289.0 | 8533.0 | 8460.0 | 8421.0 | 8427.0 | 8288.0 | ... | 33.856667 | 3.8 | 4.1 | 15.0 | 1.5 | 7.0 | 7.8 | 11.9 | 89.7 | 81.9 |
| 213 | wyoming | weston county | none | 56045.0 | 6644.0 | 7208.0 | 7141.0 | 7074.0 | 7136.0 | 7142.0 | ... | 34.710000 | 1.3 | 4.0 | 11.8 | 0.0 | 8.6 | 7.1 | 10.3 | 97.4 | 96.4 |
214 rows × 188 columns
nfcomplete = fcomplete.copy()
for col in nfcomplete.columns:
if nfcomplete[col].dtype == 'object':
nfcomplete[col] = pd.factorize(nfcomplete[col])[0]
nfcomplete = pd.DataFrame(StandardScaler().fit_transform(nfcomplete), columns=nfcomplete.columns)
nfcomplete
| state | name | smoking_ban_2010 | fips | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.287807 | -1.696376 | -0.929794 | -0.995546 | 1.570722 | 1.527704 | 1.534377 | 1.546827 | 1.562521 | 1.580662 | ... | -1.722633 | 2.095480 | -0.232816 | -1.072087 | 0.319348 | -0.963705 | -0.782206 | -1.726782 | -2.245111 | -1.709094 |
| 1 | -1.287807 | -1.679521 | -0.929794 | -0.995445 | -0.309713 | -0.328699 | -0.329518 | -0.330008 | -0.330612 | -0.331842 | ... | -0.161885 | 1.283942 | 2.746433 | -0.294662 | -0.685660 | -0.621729 | -1.181409 | 0.138051 | -1.257090 | -0.741204 |
| 2 | -1.287807 | -1.662665 | -0.929794 | -0.995343 | -0.265533 | -0.283287 | -0.285151 | -0.286570 | -0.288115 | -0.289490 | ... | -1.141647 | 1.110041 | 0.352394 | -1.020259 | -0.685660 | -0.685850 | -0.109864 | 0.890527 | 0.408758 | 0.388002 |
| 3 | -1.287807 | -1.645809 | -0.929794 | -0.995242 | -0.046441 | -0.059513 | -0.063030 | -0.064737 | -0.066545 | -0.067563 | ... | -0.792905 | 1.863612 | 0.644998 | -0.761117 | -0.384157 | -0.728597 | -0.488056 | -0.352695 | 0.115799 | 0.110182 |
| 4 | -1.287807 | -1.628953 | -0.929794 | -0.995141 | -0.258423 | -0.274077 | -0.275870 | -0.277263 | -0.278502 | -0.280074 | ... | -0.345590 | 1.341909 | 0.352394 | -1.072087 | -0.585159 | -1.262934 | -1.181409 | 0.955960 | 0.627042 | 0.508989 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 209 | 1.797163 | 1.674795 | -0.929794 | 1.539707 | -0.262257 | -0.276257 | -0.277270 | -0.277002 | -0.277930 | -0.279713 | ... | -0.735777 | -0.339134 | 0.112990 | 0.119964 | -0.183156 | 0.553813 | 1.718065 | -0.123680 | 0.793627 | 0.450736 |
| 210 | 1.797163 | 1.691650 | 0.649380 | 1.539808 | -0.287497 | -0.303950 | -0.304705 | -0.305155 | -0.305240 | -0.306033 | ... | -0.712254 | -1.266606 | -1.217032 | 0.361829 | -0.685660 | 0.746175 | 0.058221 | -1.203320 | 0.558110 | 0.526912 |
| 211 | 1.797163 | 1.708506 | -0.929794 | 1.539909 | -0.285553 | -0.304166 | -0.305408 | -0.306011 | -0.306875 | -0.308375 | ... | -0.160018 | 0.356470 | 0.059789 | 0.102688 | -0.082655 | 0.040849 | -0.803217 | -0.516276 | 0.793627 | 0.804733 |
| 212 | 1.797163 | 1.725362 | -0.929794 | 1.540011 | -0.300483 | -0.319647 | -0.320550 | -0.321187 | -0.321875 | -0.323287 | ... | 0.804061 | 0.530371 | -0.312617 | 0.759180 | 0.821852 | 0.083596 | 0.457424 | 0.955960 | 0.581087 | 0.553798 |
| 213 | 1.797163 | 1.742218 | -0.929794 | 1.540112 | -0.302627 | -0.321277 | -0.322156 | -0.322812 | -0.323420 | -0.324648 | ... | 0.899648 | -0.918804 | -0.339218 | 0.206344 | -0.685660 | 0.425572 | 0.310349 | 0.432498 | 1.023399 | 1.203540 |
214 rows × 188 columns
compPCA = PCA(n_components=5).fit(nfcomplete)
per_var = np.round(compPCA.explained_variance_ratio_ * 100, decimals= 1)
labels = ['PC' + str(x) for x in range(1,len(per_var)+1)]
compDF = pd.DataFrame(compPCA.transform(nfcomplete), columns=labels)
compDF
| PC1 | PC2 | PC3 | PC4 | PC5 | |
|---|---|---|---|---|---|
| 0 | 18.511053 | -4.096958 | -7.312470 | 5.184999 | -4.413267 |
| 1 | -1.298175 | 0.098800 | -4.032665 | -1.131603 | -1.959474 |
| 2 | -1.566966 | -3.647765 | -4.036988 | -4.664941 | -0.265970 |
| 3 | 1.083668 | 0.507101 | -4.206530 | -1.225903 | 0.238068 |
| 4 | -1.816774 | -2.437155 | -3.264555 | -5.044697 | 0.212645 |
| ... | ... | ... | ... | ... | ... |
| 209 | -2.038192 | -2.857448 | -1.047730 | 4.271761 | 2.177875 |
| 210 | -0.402166 | -8.248352 | -4.227338 | 5.941364 | 0.131405 |
| 211 | -3.093857 | -1.825476 | 0.275134 | 3.270018 | 2.639168 |
| 212 | -4.158668 | -1.832448 | 2.339907 | 1.233536 | 2.383258 |
| 213 | -4.906390 | -3.313210 | 3.311019 | 0.166198 | 1.086813 |
214 rows × 5 columns
comp_loading_scores = pd.Series(compPCA.components_[0], index= nfcomplete.columns)
sorted_comp_loading_scores = comp_loading_scores.abs().sort_values(ascending=False)
top10Features = sorted_comp_loading_scores[0:10].index.values
print(comp_loading_scores[top10Features])
households_2019 0.114686 employed_2015 0.114685 civilian_labor_force_2016 0.114678 civilian_labor_force_2017 0.114664 civilian_labor_force_2015 0.114659 pop_2019 0.114658 pop2017 0.114644 employed_2017 0.114639 households_2017 0.114638 employed_2016 0.114635 dtype: float64
bottom10Features = sorted_comp_loading_scores[-10:].index.values
print(comp_loading_scores[bottom10Features])
poverty_2016 -0.006072 age_under_5_2017 0.005694 households_speak_other_2019 -0.005034 percent_change_private_nonfarm_employment_2009 -0.004929 poverty_age_under_5_2017 -0.003414 uninsured_2017 -0.003142 age_under_5_2019 0.001684 poverty_age_under_18_2017 0.001509 poverty_2017 -0.001373 age_over_18_2019 0.000339 dtype: float64
compDF = pd.concat([fcomplete[['state']], compDF], axis = 1)
compDF
| state | PC1 | PC2 | PC3 | PC4 | PC5 | |
|---|---|---|---|---|---|---|
| 0 | california | 18.511053 | -4.096958 | -7.312470 | 5.184999 | -4.413267 |
| 1 | california | -1.298175 | 0.098800 | -4.032665 | -1.131603 | -1.959474 |
| 2 | california | -1.566966 | -3.647765 | -4.036988 | -4.664941 | -0.265970 |
| 3 | california | 1.083668 | 0.507101 | -4.206530 | -1.225903 | 0.238068 |
| 4 | california | -1.816774 | -2.437155 | -3.264555 | -5.044697 | 0.212645 |
| ... | ... | ... | ... | ... | ... | ... |
| 209 | wyoming | -2.038192 | -2.857448 | -1.047730 | 4.271761 | 2.177875 |
| 210 | wyoming | -0.402166 | -8.248352 | -4.227338 | 5.941364 | 0.131405 |
| 211 | wyoming | -3.093857 | -1.825476 | 0.275134 | 3.270018 | 2.639168 |
| 212 | wyoming | -4.158668 | -1.832448 | 2.339907 | 1.233536 | 2.383258 |
| 213 | wyoming | -4.906390 | -3.313210 | 3.311019 | 0.166198 | 1.086813 |
214 rows × 6 columns
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA - County Complete', fontsize = 20)
targets = ['california', 'florida', 'south dakota', 'wyoming']
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
indicesToKeep = compDF['state'] == target
ax.scatter(compDF.loc[indicesToKeep, 'PC1']
, compDF.loc[indicesToKeep, 'PC2']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
CountyData :
countyData
| fips | state | county | land area | water area | date | population | |
|---|---|---|---|---|---|---|---|
| 0 | 1001 | alabama | autauga | 1543.7 | 22.0 | 1990 | 34222 |
| 1 | 1003 | alabama | baldwin | 4135.0 | 1115.1 | 1990 | 98280 |
| 2 | 1005 | alabama | barbour | 2292.1 | 50.7 | 1990 | 25417 |
| 3 | 1007 | alabama | bibb | 1611.9 | 8.1 | 1990 | 16576 |
| 4 | 1009 | alabama | blount | 1672.3 | 12.9 | 1990 | 39248 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2936 | 56037 | wyoming | sweetwater | 27003.0 | 170.0 | 1990 | 38823 |
| 2937 | 56039 | wyoming | teton | 10380.6 | 554.3 | 1990 | 11172 |
| 2938 | 56041 | wyoming | uinta | 5391.7 | 15.3 | 1990 | 18705 |
| 2939 | 56043 | wyoming | washakie | 5802.0 | 7.0 | 1990 | 8388 |
| 2940 | 56045 | wyoming | weston | 6210.6 | 5.7 | 1990 | 6518 |
2941 rows × 7 columns
ncd = countyData.copy()
ncd = ncd[ncd['state'].isin(['california', 'florida', 'south dakota', 'wyoming'])]
ncd.reset_index(drop=True,inplace=True)
ncd = countyData.copy()
for col in ncd.columns:
if ncd[col].dtype == 'object':
ncd[col] = pd.factorize(ncd[col])[0]
ncd = pd.DataFrame(StandardScaler().fit_transform(ncd), columns=ncd.columns)
ncd
| fips | state | county | land area | water area | date | population | |
|---|---|---|---|---|---|---|---|
| 0 | -1.942674 | -1.698598 | -1.246086 | -0.290689 | -0.188071 | 0.0 | -0.158631 |
| 1 | -1.942541 | -1.698598 | -1.244089 | 0.489865 | 2.430236 | 0.0 | 0.082336 |
| 2 | -1.942407 | -1.698598 | -1.242093 | -0.065255 | -0.119325 | 0.0 | -0.191753 |
| 3 | -1.942274 | -1.698598 | -1.240097 | -0.270145 | -0.221365 | 0.0 | -0.225010 |
| 4 | -1.942141 | -1.698598 | -1.238100 | -0.251952 | -0.209868 | 0.0 | -0.139725 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2936 | 1.730594 | 1.734236 | 2.139931 | 7.378187 | 0.166434 | 0.0 | -0.141324 |
| 2937 | 1.730728 | 1.734236 | -0.363645 | 2.371171 | 1.086949 | 0.0 | -0.245339 |
| 2938 | 1.730861 | 1.734236 | 2.141928 | 0.868410 | -0.204119 | 0.0 | -0.217002 |
| 2939 | 1.730995 | 1.734236 | 2.143924 | 0.992001 | -0.224000 | 0.0 | -0.255811 |
| 2940 | 1.731128 | 1.734236 | 2.145921 | 1.115080 | -0.227114 | 0.0 | -0.262846 |
2941 rows × 7 columns
cdpca = PCA(n_components=7).fit(ncd)
per_var2 = np.round(cdpca.explained_variance_ratio_ * 100, decimals= 1)
labels2 = ['PC' + str(x) for x in range(1,len(per_var2)+1)]
ncdDF = pd.DataFrame(cdpca.transform(ncd), columns = labels2)
ncdDF
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | |
|---|---|---|---|---|---|---|---|
| 0 | 2.851242 | -0.381841 | -0.029324 | -0.091771 | -0.204587 | 0.174538 | 0.0 |
| 1 | 2.827894 | 1.749168 | -0.538437 | -1.727197 | 0.012878 | 0.171434 | 0.0 |
| 2 | 2.839862 | -0.257559 | 0.144689 | -0.194737 | -0.190478 | 0.173063 | 0.0 |
| 3 | 2.844112 | -0.435015 | 0.016563 | -0.122713 | -0.210183 | 0.174337 | 0.0 |
| 4 | 2.846768 | -0.365216 | 0.010291 | -0.069592 | -0.210832 | 0.173827 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2936 | -3.428057 | 3.458611 | 6.311159 | -1.210326 | -0.236897 | -0.043185 | 0.0 |
| 2937 | -2.042035 | 1.531687 | 1.546166 | -1.062887 | 1.721279 | -0.014380 | 0.0 |
| 2938 | -3.220436 | 0.220521 | 0.788318 | -0.053591 | -0.624537 | -0.000281 | 0.0 |
| 2939 | -3.227423 | 0.239826 | 0.913031 | -0.088046 | -0.620369 | -0.001072 | 0.0 |
| 2940 | -3.232830 | 0.289512 | 1.023572 | -0.109695 | -0.615370 | -0.001957 | 0.0 |
2941 rows × 7 columns
states = countyData[countyData['state'].isin(['california', 'florida', 'south dakota', 'wyoming'])]
states.reset_index(drop=True,inplace=True)
ncdDF2 = pd.concat([states[['state']], ncdDF], axis = 1)
ncdDF2
| state | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | |
|---|---|---|---|---|---|---|---|---|
| 0 | california | 2.851242 | -0.381841 | -0.029324 | -0.091771 | -0.204587 | 0.174538 | 0.0 |
| 1 | california | 2.827894 | 1.749168 | -0.538437 | -1.727197 | 0.012878 | 0.171434 | 0.0 |
| 2 | california | 2.839862 | -0.257559 | 0.144689 | -0.194737 | -0.190478 | 0.173063 | 0.0 |
| 3 | california | 2.844112 | -0.435015 | 0.016563 | -0.122713 | -0.210183 | 0.174337 | 0.0 |
| 4 | california | 2.846768 | -0.365216 | 0.010291 | -0.069592 | -0.210832 | 0.173827 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2936 | NaN | -3.428057 | 3.458611 | 6.311159 | -1.210326 | -0.236897 | -0.043185 | 0.0 |
| 2937 | NaN | -2.042035 | 1.531687 | 1.546166 | -1.062887 | 1.721279 | -0.014380 | 0.0 |
| 2938 | NaN | -3.220436 | 0.220521 | 0.788318 | -0.053591 | -0.624537 | -0.000281 | 0.0 |
| 2939 | NaN | -3.227423 | 0.239826 | 0.913031 | -0.088046 | -0.620369 | -0.001072 | 0.0 |
| 2940 | NaN | -3.232830 | 0.289512 | 1.023572 | -0.109695 | -0.615370 | -0.001957 | 0.0 |
2941 rows × 8 columns
ncdDF2.state.unique()
array(['california', 'florida', 'south dakota', 'wyoming', nan],
dtype=object)
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA - CountyData', fontsize = 20)
targets = ['california', 'florida', 'south dakota', 'wyoming', np.nan]
colors = ['r', 'g', 'b','y', 'black']
for target, color in zip(targets,colors):
indicesToKeep = ncdDF2['state'] == target
ax.scatter(ncdDF2.loc[indicesToKeep, 'PC1']
, ncdDF2.loc[indicesToKeep, 'PC2']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
loading_scores3 = pd.Series(cdpca.components_[0], index= ncd.columns)
sorted_loading_scores3 = loading_scores3.abs().sort_values(ascending=False)
top10Features3 = sorted_loading_scores3[0:10].index.values
print(loading_scores3[top10Features3])
fips -0.614235 state -0.613872 county -0.492145 population 0.051024 land area -0.032462 water area -0.003529 date -0.000000 dtype: float64
Life Expectancy :
life
| fips | state | county | year | male life expectancy (years) | years behind international frontier (male) | female life expectancy (years) | years behind international frontier (female) | white male life expectancy (years) | white female life expectancy (years) | ... | closest ranked countries for male life expectancy (lower) | closest ranked countries for female life expectancy (lower) | rank (male) | rank (female) | male life expectancy change 1987 to 2007 (years) | female life expectancy change 1987 to 2007 (years) | male life expectancy change 1987 to 1997 (years) | female life expectancy change 1987 to 1997 (years) | male life expectancy change 1997 to 2007 (years) | female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | alabama | autauga | 1987 | 69.2 | 32 | 77.4 | 12 | 70.3 | 78.5 | ... | albania,bahrain,guam,jamaica,macedonia, the fo... | guadeloupe,israel,malta,montenegro,portugal | 2684 | 2661 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 1 | 1001 | alabama | autauga | 1988 | 69.3 | 32 | 77.3 | 14 | 70.5 | 78.4 | ... | albania,jamaica,macedonia, the former yugoslav... | barbados,cuba,mayotte,reunion,slovenia | 2646 | 2691 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 2 | 1001 | alabama | autauga | 1989 | 69.8 | 25 | 77.5 | 14 | 71.0 | 78.6 | ... | albania,bahrain,guam,macedonia, the former yug... | barbados,mayotte,reunion,singapore,slovenia | 2522 | 2637 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 3 | 1001 | alabama | autauga | 1990 | 69.7 | 27 | 77.7 | 14 | 71.0 | 78.7 | ... | albania,french guiana,macedonia, the former yu... | barbados,mayotte,netherlands antilles,reunion,... | 2585 | 2590 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| 4 | 1001 | alabama | autauga | 1991 | 70.0 | 25 | 77.7 | 15 | 71.2 | 78.8 | ... | french guiana,macedonia, the former yugoslav r... | barbados,kuwait,netherlands antilles,singapore... | 2534 | 2616 | 3.7 | 0.6 | 1.5 | 0.8 | 2.2 | -0.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66082 | 56045 | wyoming | weston | 2003 | 75.0 | 12 | 80.5 | 14 | 74.9 | 80.5 | ... | brunei darussalam,chile,finland,korea, republi... | cuba,denmark,greece,mayotte,reunion | 957 | 749 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66083 | 56045 | wyoming | weston | 2004 | 75.4 | 11 | 80.6 | 15 | 75.4 | 80.5 | ... | chile,denmark,finland,guadeloupe,virgin island... | cuba,denmark,kuwait,mayotte,reunion | 779 | 808 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66084 | 56045 | wyoming | weston | 2005 | 75.4 | 12 | 80.7 | 15 | 75.4 | 80.7 | ... | brunei darussalam,chile,korea, republic of,por... | cuba,denmark,french guiana,mayotte,reunion | 883 | 742 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66085 | 56045 | wyoming | weston | 2006 | 75.6 | 12 | 81.1 | 14 | 75.6 | 81.1 | ... | brunei darussalam,chile,korea, republic of,por... | cuba,denmark,greece,mayotte,reunion | 843 | 563 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 66086 | 56045 | wyoming | weston | 2007 | 75.9 | 12 | 81.5 | 13 | 75.9 | 81.5 | ... | brunei darussalam,chile,korea, republic of,por... | channel islands,greece,malta,united kingdom,un... | 727 | 393 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
66087 rows × 24 columns
nlife = life.copy()
nlife = nlife[nlife['state'].isin(['california', 'florida', 'south dakota', 'wyoming'])]
nlife.reset_index(drop=True,inplace=True)
nlife
| fips | state | county | year | male life expectancy (years) | years behind international frontier (male) | female life expectancy (years) | years behind international frontier (female) | white male life expectancy (years) | white female life expectancy (years) | ... | closest ranked countries for male life expectancy (lower) | closest ranked countries for female life expectancy (lower) | rank (male) | rank (female) | male life expectancy change 1987 to 2007 (years) | female life expectancy change 1987 to 2007 (years) | male life expectancy change 1987 to 1997 (years) | female life expectancy change 1987 to 1997 (years) | male life expectancy change 1997 to 2007 (years) | female life expectancy change 1997 to 2007 (years) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6001 | california | alameda | 1987 | 71.6 | 12 | 78.1 | 10 | 72.1 | 78.6 | ... | barbados,germany,ireland,netherlands antilles,... | austria,denmark,germany,united kingdom,virgin ... | 1444 | 2262 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 1 | 6001 | california | alameda | 1988 | 71.6 | 12 | 78.0 | 11 | 72.2 | 78.6 | ... | aruba,austria,netherlands antilles,singapore,u... | costa rica,denmark,montenegro,new zealand,virg... | 1538 | 2310 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 2 | 6001 | california | alameda | 1989 | 71.5 | 14 | 78.2 | 11 | 72.1 | 78.8 | ... | aruba,finland,guadeloupe,luxembourg,virgin isl... | costa rica,denmark,guadeloupe,montenegro,new z... | 1709 | 2212 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 3 | 6001 | california | alameda | 1990 | 71.9 | 13 | 78.4 | 12 | 72.5 | 79.0 | ... | finland,guadeloupe,luxembourg,netherlands anti... | costa rica,israel,montenegro,new zealand,portugal | 1547 | 2186 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| 4 | 6001 | california | alameda | 1991 | 72.1 | 13 | 78.7 | 11 | 72.7 | 79.2 | ... | barbados,guadeloupe,luxembourg,netherlands ant... | costa rica,israel,montenegro,new zealand,unite... | 1515 | 2002 | 6.1 | 4.2 | 2.6 | 1.9 | 3.5 | 2.3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4489 | 56045 | wyoming | weston | 2003 | 75.0 | 12 | 80.5 | 14 | 74.9 | 80.5 | ... | brunei darussalam,chile,finland,korea, republi... | cuba,denmark,greece,mayotte,reunion | 957 | 749 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4490 | 56045 | wyoming | weston | 2004 | 75.4 | 11 | 80.6 | 15 | 75.4 | 80.5 | ... | chile,denmark,finland,guadeloupe,virgin island... | cuba,denmark,kuwait,mayotte,reunion | 779 | 808 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4491 | 56045 | wyoming | weston | 2005 | 75.4 | 12 | 80.7 | 15 | 75.4 | 80.7 | ... | brunei darussalam,chile,korea, republic of,por... | cuba,denmark,french guiana,mayotte,reunion | 883 | 742 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4492 | 56045 | wyoming | weston | 2006 | 75.6 | 12 | 81.1 | 14 | 75.6 | 81.1 | ... | brunei darussalam,chile,korea, republic of,por... | cuba,denmark,greece,mayotte,reunion | 843 | 563 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
| 4493 | 56045 | wyoming | weston | 2007 | 75.9 | 12 | 81.5 | 13 | 75.9 | 81.5 | ... | brunei darussalam,chile,korea, republic of,por... | channel islands,greece,malta,united kingdom,un... | 727 | 393 | 3.5 | 1.7 | 2.0 | 0.2 | 1.5 | 1.5 |
4494 rows × 24 columns
nnlife = nlife.copy()
for col in nnlife.columns:
if nnlife[col].dtype == 'object':
nnlife[col] = pd.factorize(nnlife[col])[0]
lfscaled = pd.DataFrame(StandardScaler().fit_transform(nnlife), columns= nlife.columns)
lifepca = PCA(n_components=5).fit(lfscaled)
per_var4 = np.round(lifepca.explained_variance_ratio_ * 100, decimals= 1)
labels4 = ['PC' + str(x) for x in range(1,len(per_var4)+1)]
lifePcaDF = pd.DataFrame(lifepca.transform(lfscaled), columns = labels4)
lifePcaDF
| PC1 | PC2 | PC3 | PC4 | PC5 | |
|---|---|---|---|---|---|
| 0 | 2.235689 | 4.434735 | -3.614656 | 2.422027 | 1.385105 |
| 1 | 2.177542 | 4.482998 | -3.527317 | 2.328437 | 1.334714 |
| 2 | 2.205513 | 4.462234 | -3.426428 | 2.250725 | 1.289718 |
| 3 | 2.455822 | 4.392868 | -3.207508 | 2.035737 | 1.182018 |
| 4 | 2.711721 | 4.260365 | -3.095504 | 1.891819 | 1.123355 |
| ... | ... | ... | ... | ... | ... |
| 4489 | 0.728112 | -3.026469 | 1.222870 | 0.334806 | 0.131530 |
| 4490 | 0.702903 | -3.162869 | 1.527003 | 0.855603 | -1.138693 |
| 4491 | 1.269618 | -3.301538 | 0.467839 | -0.173868 | -0.053236 |
| 4492 | 1.585737 | -3.449993 | 0.673219 | -0.356272 | -0.075756 |
| 4493 | 1.173348 | -3.379952 | 2.879027 | 0.645889 | -1.646874 |
4494 rows × 5 columns
lifePcaDF = pd.concat([nlife[['state']], lifePcaDF], axis = 1)
lifePcaDF
| state | PC1 | PC2 | PC3 | PC4 | PC5 | |
|---|---|---|---|---|---|---|
| 0 | california | 2.235689 | 4.434735 | -3.614656 | 2.422027 | 1.385105 |
| 1 | california | 2.177542 | 4.482998 | -3.527317 | 2.328437 | 1.334714 |
| 2 | california | 2.205513 | 4.462234 | -3.426428 | 2.250725 | 1.289718 |
| 3 | california | 2.455822 | 4.392868 | -3.207508 | 2.035737 | 1.182018 |
| 4 | california | 2.711721 | 4.260365 | -3.095504 | 1.891819 | 1.123355 |
| ... | ... | ... | ... | ... | ... | ... |
| 4489 | wyoming | 0.728112 | -3.026469 | 1.222870 | 0.334806 | 0.131530 |
| 4490 | wyoming | 0.702903 | -3.162869 | 1.527003 | 0.855603 | -1.138693 |
| 4491 | wyoming | 1.269618 | -3.301538 | 0.467839 | -0.173868 | -0.053236 |
| 4492 | wyoming | 1.585737 | -3.449993 | 0.673219 | -0.356272 | -0.075756 |
| 4493 | wyoming | 1.173348 | -3.379952 | 2.879027 | 0.645889 | -1.646874 |
4494 rows × 6 columns
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA - Life Expectancy ', fontsize = 20)
targets = ['california', 'florida', 'south dakota', 'wyoming', np.nan]
colors = ['r', 'g', 'b','y', 'black']
for target, color in zip(targets,colors):
indicesToKeep = lifePcaDF['state'] == target
ax.scatter(lifePcaDF.loc[indicesToKeep, 'PC1']
, lifePcaDF.loc[indicesToKeep, 'PC2']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
loading_scores5 = pd.Series(lifepca.components_[0], index= nlife.columns)
sorted_loading_scores5 = loading_scores5.abs().sort_values(ascending=False)
top10Features5 = sorted_loading_scores5[0:10].index.values
print(loading_scores5[top10Features5])
rank (male) -0.301151 white female life expectancy (years) 0.298200 male life expectancy (years) 0.292480 white male life expectancy (years) 0.289211 female life expectancy change 1987 to 2007 (years) 0.288658 female life expectancy (years) 0.286506 rank (female) -0.271495 male life expectancy change 1987 to 2007 (years) 0.268578 female life expectancy change 1987 to 1997 (years) 0.251781 female life expectancy change 1997 to 2007 (years) 0.246571 dtype: float64
topDF = pd.merge(states[['fips', 'state', 'county']],
pd.merge(nlife[['fips', 'county', 'state', 'rank (male)', 'white female life expectancy (years)', 'male life expectancy (years)', 'white male life expectancy (years)', 'female life expectancy change 1987 to 2007 (years)']],
fcomplete[['fips', 'state', 'households_2019', 'employed_2015','civilian_labor_force_2016', 'civilian_labor_force_2017', 'civilian_labor_force_2015']]))
topDF
| fips | state | county | rank (male) | white female life expectancy (years) | male life expectancy (years) | white male life expectancy (years) | female life expectancy change 1987 to 2007 (years) | households_2019 | employed_2015 | civilian_labor_force_2016 | civilian_labor_force_2017 | civilian_labor_force_2015 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6001 | california | alameda | 1444 | 78.6 | 71.6 | 72.1 | 4.2 | 577177.0 | 784217.0 | 837583.0 | 848335.0 | 823130.0 |
| 1 | 6001 | california | alameda | 1538 | 78.6 | 71.6 | 72.2 | 4.2 | 577177.0 | 784217.0 | 837583.0 | 848335.0 | 823130.0 |
| 2 | 6001 | california | alameda | 1709 | 78.8 | 71.5 | 72.1 | 4.2 | 577177.0 | 784217.0 | 837583.0 | 848335.0 | 823130.0 |
| 3 | 6001 | california | alameda | 1547 | 79.0 | 71.9 | 72.5 | 4.2 | 577177.0 | 784217.0 | 837583.0 | 848335.0 | 823130.0 |
| 4 | 6001 | california | alameda | 1515 | 79.2 | 72.1 | 72.7 | 4.2 | 577177.0 | 784217.0 | 837583.0 | 848335.0 | 823130.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4258 | 56045 | wyoming | weston | 957 | 80.5 | 75.0 | 74.9 | 1.7 | 2891.0 | 3834.0 | 3965.0 | 3771.0 | 3970.0 |
| 4259 | 56045 | wyoming | weston | 779 | 80.5 | 75.4 | 75.4 | 1.7 | 2891.0 | 3834.0 | 3965.0 | 3771.0 | 3970.0 |
| 4260 | 56045 | wyoming | weston | 883 | 80.7 | 75.4 | 75.4 | 1.7 | 2891.0 | 3834.0 | 3965.0 | 3771.0 | 3970.0 |
| 4261 | 56045 | wyoming | weston | 843 | 81.1 | 75.6 | 75.6 | 1.7 | 2891.0 | 3834.0 | 3965.0 | 3771.0 | 3970.0 |
| 4262 | 56045 | wyoming | weston | 727 | 81.5 | 75.9 | 75.9 | 1.7 | 2891.0 | 3834.0 | 3965.0 | 3771.0 | 3970.0 |
4263 rows × 13 columns
ntopDF = topDF.copy()
for col in ntopDF.columns:
if ntopDF[col].dtype == 'object':
ntopDF[col] = pd.factorize(ntopDF[col])[0]
ntopDF = pd.DataFrame(StandardScaler().fit_transform(ntopDF), columns=ntopDF.columns)
ntopDF
| fips | state | county | rank (male) | white female life expectancy (years) | male life expectancy (years) | white male life expectancy (years) | female life expectancy change 1987 to 2007 (years) | households_2019 | employed_2015 | civilian_labor_force_2016 | civilian_labor_force_2017 | civilian_labor_force_2015 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.994517 | -1.292797 | -1.695422 | 0.272777 | -1.089796 | -0.889447 | -0.748580 | 2.168689 | 1.722256 | 1.704367 | 1.685187 | 1.682577 | 1.674377 |
| 1 | -0.994517 | -1.292797 | -1.695422 | 0.383143 | -1.089796 | -0.889447 | -0.699014 | 2.168689 | 1.722256 | 1.704367 | 1.685187 | 1.682577 | 1.674377 |
| 2 | -0.994517 | -1.292797 | -1.695422 | 0.583915 | -0.923744 | -0.937327 | -0.748580 | 2.168689 | 1.722256 | 1.704367 | 1.685187 | 1.682577 | 1.674377 |
| 3 | -0.994517 | -1.292797 | -1.695422 | 0.393710 | -0.757692 | -0.745809 | -0.550317 | 2.168689 | 1.722256 | 1.704367 | 1.685187 | 1.682577 | 1.674377 |
| 4 | -0.994517 | -1.292797 | -1.695422 | 0.356139 | -0.591640 | -0.650050 | -0.451185 | 2.168689 | 1.722256 | 1.704367 | 1.685187 | 1.682577 | 1.674377 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4258 | 1.536376 | 1.782646 | 1.743077 | -0.299011 | 0.487697 | 0.738453 | 0.639263 | -0.034734 | -0.326247 | -0.300324 | -0.300688 | -0.301444 | -0.299778 |
| 4259 | 1.536376 | 1.782646 | 1.743077 | -0.508001 | 0.487697 | 0.929970 | 0.887092 | -0.034734 | -0.326247 | -0.300324 | -0.300688 | -0.301444 | -0.299778 |
| 4260 | 1.536376 | 1.782646 | 1.743077 | -0.385895 | 0.653749 | 0.929970 | 0.887092 | -0.034734 | -0.326247 | -0.300324 | -0.300688 | -0.301444 | -0.299778 |
| 4261 | 1.536376 | 1.782646 | 1.743077 | -0.432859 | 0.985853 | 1.025729 | 0.986224 | -0.034734 | -0.326247 | -0.300324 | -0.300688 | -0.301444 | -0.299778 |
| 4262 | 1.536376 | 1.782646 | 1.743077 | -0.569055 | 1.317957 | 1.169367 | 1.134921 | -0.034734 | -0.326247 | -0.300324 | -0.300688 | -0.301444 | -0.299778 |
4263 rows × 13 columns
toppca = PCA(n_components=10).fit(ntopDF)
per_var6 = np.round(toppca.explained_variance_ratio_ * 100, decimals= 1)
labels6 = ['PC' + str(x) for x in range(1,len(per_var6)+1)]
topPcaDF = pd.DataFrame(toppca.transform(ntopDF), columns = labels6)
topPcaDF
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.082790 | 2.473945 | -0.324879 | 1.912858 | -0.478343 | 0.427408 | 0.456770 | 0.047968 | 0.123221 | 0.069004 |
| 1 | 4.076047 | 2.501004 | -0.326608 | 1.887398 | -0.388557 | 0.480950 | 0.496473 | 0.054592 | 0.148776 | 0.071614 |
| 2 | 4.057014 | 2.557305 | -0.312479 | 1.884809 | -0.176302 | 0.383997 | 0.584514 | 0.136168 | 0.153301 | 0.067797 |
| 3 | 4.167776 | 2.239772 | -0.404217 | 1.761155 | -0.204045 | 0.361948 | 0.555267 | 0.107815 | 0.149130 | 0.065696 |
| 4 | 4.227130 | 2.070157 | -0.453873 | 1.686301 | -0.138405 | 0.308802 | 0.571605 | 0.122374 | 0.148324 | 0.062881 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4258 | -1.176240 | -2.244319 | 1.894844 | -0.126827 | 0.340810 | 0.420539 | 0.014152 | -0.021307 | -0.077132 | -0.013028 |
| 4259 | -1.078254 | -2.519153 | 1.807959 | -0.248485 | 0.234020 | 0.532016 | -0.042302 | -0.104481 | -0.058733 | -0.009092 |
| 4260 | -1.071265 | -2.537821 | 1.800882 | -0.275240 | 0.397949 | 0.449902 | 0.022114 | -0.045023 | -0.055961 | -0.012570 |
| 4261 | -0.987083 | -2.782662 | 1.734909 | -0.363761 | 0.535986 | 0.274890 | 0.056303 | -0.008086 | -0.057960 | -0.019833 |
| 4262 | -0.875445 | -3.107016 | 1.646988 | -0.475398 | 0.618340 | 0.111523 | 0.063126 | 0.003752 | -0.062031 | -0.026834 |
4263 rows × 10 columns
topPcaDF = pd.concat([topDF[['state']], topPcaDF], axis = 1)
topPcaDF
| state | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | california | 4.082790 | 2.473945 | -0.324879 | 1.912858 | -0.478343 | 0.427408 | 0.456770 | 0.047968 | 0.123221 | 0.069004 |
| 1 | california | 4.076047 | 2.501004 | -0.326608 | 1.887398 | -0.388557 | 0.480950 | 0.496473 | 0.054592 | 0.148776 | 0.071614 |
| 2 | california | 4.057014 | 2.557305 | -0.312479 | 1.884809 | -0.176302 | 0.383997 | 0.584514 | 0.136168 | 0.153301 | 0.067797 |
| 3 | california | 4.167776 | 2.239772 | -0.404217 | 1.761155 | -0.204045 | 0.361948 | 0.555267 | 0.107815 | 0.149130 | 0.065696 |
| 4 | california | 4.227130 | 2.070157 | -0.453873 | 1.686301 | -0.138405 | 0.308802 | 0.571605 | 0.122374 | 0.148324 | 0.062881 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4258 | wyoming | -1.176240 | -2.244319 | 1.894844 | -0.126827 | 0.340810 | 0.420539 | 0.014152 | -0.021307 | -0.077132 | -0.013028 |
| 4259 | wyoming | -1.078254 | -2.519153 | 1.807959 | -0.248485 | 0.234020 | 0.532016 | -0.042302 | -0.104481 | -0.058733 | -0.009092 |
| 4260 | wyoming | -1.071265 | -2.537821 | 1.800882 | -0.275240 | 0.397949 | 0.449902 | 0.022114 | -0.045023 | -0.055961 | -0.012570 |
| 4261 | wyoming | -0.987083 | -2.782662 | 1.734909 | -0.363761 | 0.535986 | 0.274890 | 0.056303 | -0.008086 | -0.057960 | -0.019833 |
| 4262 | wyoming | -0.875445 | -3.107016 | 1.646988 | -0.475398 | 0.618340 | 0.111523 | 0.063126 | 0.003752 | -0.062031 | -0.026834 |
4263 rows × 11 columns
loading_scores6 = pd.Series(toppca.components_[0], index= topDF.columns)
sorted_loading_scores6 = loading_scores6.abs().sort_values(ascending=False)
top10Features6 = sorted_loading_scores6[0:10].index.values
print(loading_scores6[top10Features6])
households_2019 0.390954 employed_2015 0.389889 civilian_labor_force_2016 0.389676 civilian_labor_force_2017 0.389675 civilian_labor_force_2015 0.389549 female life expectancy change 1987 to 2007 (years) 0.239753 state -0.182816 county -0.180209 fips -0.167646 white male life expectancy (years) 0.166015 dtype: float64
bottom10Features2 = sorted_loading_scores6[-10:].index.values
print(loading_scores6[bottom10Features2])
civilian_labor_force_2017 0.389675 civilian_labor_force_2015 0.389549 female life expectancy change 1987 to 2007 (years) 0.239753 state -0.182816 county -0.180209 fips -0.167646 white male life expectancy (years) 0.166015 male life expectancy (years) 0.148770 white female life expectancy (years) 0.141845 rank (male) -0.135653 dtype: float64
fig = plt.figure(figsize = (15,15))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA - Section B ', fontsize = 20)
targets = ['california', 'florida', 'south dakota', 'wyoming']
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
indicesToKeep = topPcaDF['state'] == target
ax.scatter(topPcaDF.loc[indicesToKeep, 'PC1']
, topPcaDF.loc[indicesToKeep, 'PC2']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
Now that we merged only top features from each dataframe, we already reduced from 214 to 13 features and that improved the result, now top 2 pcs covers ~78% in compare with before (~23%).
We can notice that wyoming and south dakota form a cluster, and california with florida make another cluster.
Califorina have some outlayer counties which differs by folds from the rest of california counties (on top right of the graph).
Though the two principal component used covers ~78%, the spearation between the 4 states is not that significant.
tsne = TSNE()
stsne = tsne.fit_transform(ntopDF)
stsne
C:\Users\lover\anaconda3\lib\site-packages\sklearn\manifold\_t_sne.py:783: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. FutureWarning, C:\Users\lover\anaconda3\lib\site-packages\sklearn\manifold\_t_sne.py:793: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2. FutureWarning,
array([[ 58.56721 , -34.618355],
[ 58.568516, -34.62335 ],
[ 58.570873, -34.63745 ],
...,
[-27.063814, 66.90498 ],
[-27.859016, 65.35779 ],
[-29.66391 , 62.852104]], dtype=float32)
stsne = pd.DataFrame(stsne, columns=['PC1', 'PC2'])
stsne = pd.concat([stsne,topDF[['state']]], axis = 1)
stsne
| PC1 | PC2 | state | |
|---|---|---|---|
| 0 | 58.567211 | -34.618355 | california |
| 1 | 58.568516 | -34.623348 | california |
| 2 | 58.570873 | -34.637451 | california |
| 3 | 58.289040 | -34.690372 | california |
| 4 | 58.094250 | -34.738884 | california |
| ... | ... | ... | ... |
| 4258 | -24.568171 | 69.119446 | wyoming |
| 4259 | -27.116047 | 67.679237 | wyoming |
| 4260 | -27.063814 | 66.904984 | wyoming |
| 4261 | -27.859016 | 65.357788 | wyoming |
| 4262 | -29.663910 | 62.852104 | wyoming |
4263 rows × 3 columns
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component TSNE', fontsize = 20)
targets = ['california', 'florida', 'south dakota', 'wyoming']
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
indicesToKeep = stsne['state'] == target
ax.scatter(stsne.loc[indicesToKeep, 'PC1']
, stsne.loc[indicesToKeep, 'PC2']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
TSNE doesn't support explained_varianceratio so we cannot tell which features are used, more important or less important. But surprisngly enough we can see a way better sepearation between the 4 states.
Less outlayer counties in california, some outlayers in south dakota and wyoming.
Still we can see wyoming is more similiar to south dakota, florida more similiar to california.
# show the us election results dataframes
print("House")
display(house.head())
print("President")
display(president.head())
print("Senate")
display(senate.head())
House
| year | state | state_po | state_fips | state_cen | state_ic | office | district | stage | runoff | special | candidate | party | writein | mode | candidatevotes | totalvotes | unofficial | version | fusion_ticket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | alabama | al | 1 | 63 | 41 | us house | 1 | gen | False | False | bill davenport | democrat | False | total | 58906 | 157170 | False | 20220331 | False |
| 1 | 1976 | alabama | al | 1 | 63 | 41 | us house | 1 | gen | False | False | jack edwards | republican | False | total | 98257 | 157170 | False | 20220331 | False |
| 2 | 1976 | alabama | al | 1 | 63 | 41 | us house | 1 | gen | False | False | writein | unknown | True | total | 7 | 157170 | False | 20220331 | False |
| 3 | 1976 | alabama | al | 1 | 63 | 41 | us house | 2 | gen | False | False | j carole keahey | democrat | False | total | 66288 | 156362 | False | 20220331 | False |
| 4 | 1976 | alabama | al | 1 | 63 | 41 | us house | 2 | gen | False | False | william l "bill" dickinson | republican | False | total | 90069 | 156362 | False | 20220331 | False |
President
| year | state | state_po | state_fips | state_cen | state_ic | office | candidate | party_detailed | writein | candidatevotes | totalvotes | version | party_simplified | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | alabama | al | 1 | 63 | 41 | us president | carter, jimmy | democrat | False | 659170 | 1182850 | 20210113 | democrat |
| 1 | 1976 | alabama | al | 1 | 63 | 41 | us president | ford, gerald | republican | False | 504070 | 1182850 | 20210113 | republican |
| 2 | 1976 | alabama | al | 1 | 63 | 41 | us president | maddox, lester | american independent party | False | 9198 | 1182850 | 20210113 | other |
| 3 | 1976 | alabama | al | 1 | 63 | 41 | us president | bubar, benjamin ""ben"" | prohibition | False | 6669 | 1182850 | 20210113 | other |
| 4 | 1976 | alabama | al | 1 | 63 | 41 | us president | hall, gus | communist party use | False | 1954 | 1182850 | 20210113 | other |
Senate
| year | state | state_po | state_fips | state_cen | state_ic | office | district | stage | special | candidate | party_detailed | writein | mode | candidatevotes | totalvotes | unofficial | version | party_simplified | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | ARIZONA | AZ | 4 | 86 | 61 | US SENATE | statewide | gen | False | SAM STEIGER | REPUBLICAN | False | total | 321236 | 741210 | False | 20210114 | REPUBLICAN |
| 1 | 1976 | ARIZONA | AZ | 4 | 86 | 61 | US SENATE | statewide | gen | False | WM. MATHEWS FEIGHAN | INDEPENDENT | False | total | 1565 | 741210 | False | 20210114 | OTHER |
| 2 | 1976 | ARIZONA | AZ | 4 | 86 | 61 | US SENATE | statewide | gen | False | DENNIS DECONCINI | DEMOCRAT | False | total | 400334 | 741210 | False | 20210114 | DEMOCRAT |
| 3 | 1976 | ARIZONA | AZ | 4 | 86 | 61 | US SENATE | statewide | gen | False | ALLAN NORWITZ | LIBERTARIAN | False | total | 7310 | 741210 | False | 20210114 | LIBERTARIAN |
| 4 | 1976 | ARIZONA | AZ | 4 | 86 | 61 | US SENATE | statewide | gen | False | BOB FIELD | INDEPENDENT | False | total | 10765 | 741210 | False | 20210114 | OTHER |
houseg = house.groupby(['state', 'year'], as_index=False).sum()
houseg
| state | year | state_fips | state_cen | state_ic | district | special | writein | candidatevotes | totalvotes | unofficial | version | fusion_ticket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | alabama | 1976 | 19 | 1197 | 779 | 72 | 0 | 6 | 984181 | 2728482 | 0 | 384186289 | 0 |
| 1 | alabama | 1978 | 16 | 1008 | 656 | 68 | 0 | 3 | 642279 | 1438662 | 0 | 323525296 | 0 |
| 2 | alabama | 1980 | 23 | 1449 | 943 | 98 | 0 | 3 | 1013626 | 3517307 | 0 | 465067613 | 0 |
| 3 | alabama | 1982 | 19 | 1197 | 779 | 80 | 0 | 3 | 961019 | 2629352 | 0 | 384186289 | 0 |
| 4 | alabama | 1984 | 16 | 1008 | 656 | 66 | 0 | 0 | 1148574 | 2812277 | 0 | 323525296 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1146 | wyoming | 2012 | 448 | 664 | 544 | 0 | 0 | 1 | 250700 | 2005600 | 0 | 161762648 | 0 |
| 1147 | wyoming | 2014 | 392 | 581 | 476 | 0 | 0 | 1 | 171153 | 1198071 | 0 | 141542317 | 0 |
| 1148 | wyoming | 2016 | 392 | 581 | 476 | 0 | 0 | 1 | 258788 | 1811516 | 0 | 141542317 | 0 |
| 1149 | wyoming | 2018 | 280 | 415 | 340 | 0 | 0 | 1 | 201245 | 1006225 | 0 | 101101655 | 0 |
| 1150 | wyoming | 2020 | 392 | 581 | 476 | 0 | 0 | 1 | 278503 | 1949521 | 0 | 141542317 | 0 |
1151 rows × 13 columns
houseg.head(20)
| state | year | state_fips | state_cen | state_ic | district | special | writein | candidatevotes | totalvotes | unofficial | version | fusion_ticket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | alabama | 1976 | 19 | 1197 | 779 | 72 | 0 | 6 | 984181 | 2728482 | 0 | 384186289 | 0 |
| 1 | alabama | 1978 | 16 | 1008 | 656 | 68 | 0 | 3 | 642279 | 1438662 | 0 | 323525296 | 0 |
| 2 | alabama | 1980 | 23 | 1449 | 943 | 98 | 0 | 3 | 1013626 | 3517307 | 0 | 465067613 | 0 |
| 3 | alabama | 1982 | 19 | 1197 | 779 | 80 | 0 | 3 | 961019 | 2629352 | 0 | 384186289 | 0 |
| 4 | alabama | 1984 | 16 | 1008 | 656 | 66 | 0 | 0 | 1148574 | 2812277 | 0 | 323525296 | 0 |
| 5 | alabama | 1986 | 14 | 882 | 574 | 61 | 0 | 0 | 1115517 | 2326541 | 0 | 283084634 | 0 |
| 6 | alabama | 1988 | 25 | 1575 | 1025 | 103 | 0 | 7 | 1178298 | 4325872 | 0 | 505508275 | 0 |
| 7 | alabama | 1990 | 19 | 1197 | 779 | 79 | 0 | 6 | 1015869 | 2834865 | 0 | 384186289 | 0 |
| 8 | alabama | 1992 | 34 | 2142 | 1394 | 143 | 0 | 7 | 1602536 | 7769922 | 0 | 687491254 | 0 |
| 9 | alabama | 1994 | 20 | 1260 | 820 | 80 | 0 | 7 | 1115019 | 3223795 | 0 | 404406620 | 0 |
| 10 | alabama | 1996 | 31 | 1953 | 1271 | 126 | 0 | 7 | 1468693 | 6530225 | 0 | 626830261 | 0 |
| 11 | alabama | 1998 | 19 | 1197 | 779 | 76 | 0 | 7 | 1215179 | 3392792 | 0 | 384186289 | 0 |
| 12 | alabama | 2000 | 24 | 1512 | 984 | 97 | 0 | 7 | 1438994 | 4969357 | 0 | 485287944 | 0 |
| 13 | alabama | 2002 | 25 | 1575 | 1025 | 95 | 0 | 7 | 1268802 | 4549452 | 0 | 505508275 | 0 |
| 14 | alabama | 2004 | 20 | 1260 | 820 | 78 | 0 | 7 | 1792759 | 5110234 | 0 | 404406620 | 0 |
| 15 | alabama | 2006 | 19 | 1197 | 779 | 69 | 0 | 7 | 1140152 | 3138738 | 0 | 384186289 | 0 |
| 16 | alabama | 2008 | 18 | 1134 | 738 | 70 | 0 | 7 | 1855268 | 4832499 | 0 | 363965958 | 0 |
| 17 | alabama | 2010 | 19 | 1197 | 779 | 74 | 0 | 7 | 1367747 | 3724156 | 0 | 384186289 | 0 |
| 18 | alabama | 2012 | 20 | 1260 | 820 | 83 | 0 | 7 | 1933630 | 5600214 | 0 | 404406620 | 0 |
| 19 | alabama | 2014 | 19 | 1197 | 779 | 73 | 0 | 7 | 1080880 | 2971989 | 0 | 384186289 | 0 |
houseg.columns
Index(['state', 'year', 'state_fips', 'state_cen', 'state_ic', 'district',
'special', 'writein', 'candidatevotes', 'totalvotes', 'unofficial',
'version', 'fusion_ticket'],
dtype='object')
houseg = houseg[['state', 'candidatevotes']]
houseg
| state | candidatevotes | |
|---|---|---|
| 0 | alabama | 984181 |
| 1 | alabama | 642279 |
| 2 | alabama | 1013626 |
| 3 | alabama | 961019 |
| 4 | alabama | 1148574 |
| ... | ... | ... |
| 1146 | wyoming | 250700 |
| 1147 | wyoming | 171153 |
| 1148 | wyoming | 258788 |
| 1149 | wyoming | 201245 |
| 1150 | wyoming | 278503 |
1151 rows × 2 columns
print(type(houseg))
<class 'pandas.core.frame.DataFrame'>
compg = complete.groupby(['state'], as_index=False).agg({'pop2010': 'sum', 'age_under_18_2010': 'mean'})
compg
| state | pop2010 | age_under_18_2010 | |
|---|---|---|---|
| 0 | alabama | 4779736.0 | 23.479104 |
| 1 | alaska | 710231.0 | 25.493103 |
| 2 | arizona | 6392017.0 | 25.306667 |
| 3 | arkansas | 2915918.0 | 23.542667 |
| 4 | california | 37253956.0 | 23.658621 |
| 5 | colorado | 5029196.0 | 22.057813 |
| 6 | connecticut | 3574097.0 | 22.125000 |
| 7 | delaware | 897934.0 | 22.833333 |
| 8 | district of columbia | 601723.0 | 16.800000 |
| 9 | florida | 18801310.0 | 20.776119 |
| 10 | georgia | 9687653.0 | 24.550314 |
| 11 | hawaii | 1360301.0 | 18.140000 |
| 12 | idaho | 1567582.0 | 26.511364 |
| 13 | illinois | 12830632.0 | 22.741176 |
| 14 | indiana | 6483802.0 | 24.410870 |
| 15 | iowa | 3046355.0 | 23.644444 |
| 16 | kansas | 2853118.0 | 24.143810 |
| 17 | kentucky | 4339367.0 | 23.425833 |
| 18 | louisiana | 4533372.0 | 24.717188 |
| 19 | maine | 1328361.0 | 20.337500 |
| 20 | maryland | 5773552.0 | 22.825000 |
| 21 | massachusetts | 6547629.0 | 20.821429 |
| 22 | michigan | 9883640.0 | 21.756627 |
| 23 | minnesota | 5303925.0 | 23.708046 |
| 24 | mississippi | 2967297.0 | 25.182927 |
| 25 | missouri | 5988927.0 | 23.580870 |
| 26 | montana | 989415.0 | 22.301786 |
| 27 | nebraska | 1826341.0 | 23.709677 |
| 28 | nevada | 2700551.0 | 22.952941 |
| 29 | new hampshire | 1316470.0 | 20.600000 |
| 30 | new jersey | 8791894.0 | 23.428571 |
| 31 | new mexico | 2059179.0 | 23.624242 |
| 32 | new york | 19378102.0 | 21.903226 |
| 33 | north carolina | 9535483.0 | 22.575000 |
| 34 | north dakota | 672591.0 | 21.805660 |
| 35 | ohio | 11536504.0 | 23.939773 |
| 36 | oklahoma | 3751351.0 | 24.337662 |
| 37 | oregon | 3831074.0 | 21.869444 |
| 38 | pennsylvania | 12702379.0 | 21.217910 |
| 39 | rhode island | 1052567.0 | 20.580000 |
| 40 | south carolina | 4625364.0 | 23.213043 |
| 41 | south dakota | 814180.0 | 25.281818 |
| 42 | tennessee | 6346105.0 | 22.808421 |
| 43 | texas | 25145561.0 | 25.064173 |
| 44 | utah | 2763885.0 | 31.079310 |
| 45 | vermont | 625741.0 | 20.785714 |
| 46 | virginia | 7994802.0 | 21.342105 |
| 47 | washington | 6724540.0 | 22.794872 |
| 48 | west virginia | 1852994.0 | 20.760000 |
| 49 | wisconsin | 5686986.0 | 22.684722 |
| 50 | wyoming | 563626.0 | 23.382609 |
compg['can_vote_2010'] = compg['pop2010'] * (1 -compg['age_under_18_2010']/100)
compg
| state | pop2010 | age_under_18_2010 | can_vote_2010 | |
|---|---|---|---|---|
| 0 | alabama | 4779736.0 | 23.479104 | 3.657497e+06 |
| 1 | alaska | 710231.0 | 25.493103 | 5.291711e+05 |
| 2 | arizona | 6392017.0 | 25.306667 | 4.774411e+06 |
| 3 | arkansas | 2915918.0 | 23.542667 | 2.229433e+06 |
| 4 | california | 37253956.0 | 23.658621 | 2.844018e+07 |
| 5 | colorado | 5029196.0 | 22.057813 | 3.919865e+06 |
| 6 | connecticut | 3574097.0 | 22.125000 | 2.783328e+06 |
| 7 | delaware | 897934.0 | 22.833333 | 6.929057e+05 |
| 8 | district of columbia | 601723.0 | 16.800000 | 5.006335e+05 |
| 9 | florida | 18801310.0 | 20.776119 | 1.489513e+07 |
| 10 | georgia | 9687653.0 | 24.550314 | 7.309304e+06 |
| 11 | hawaii | 1360301.0 | 18.140000 | 1.113542e+06 |
| 12 | idaho | 1567582.0 | 26.511364 | 1.151995e+06 |
| 13 | illinois | 12830632.0 | 22.741176 | 9.912795e+06 |
| 14 | indiana | 6483802.0 | 24.410870 | 4.901050e+06 |
| 15 | iowa | 3046355.0 | 23.644444 | 2.326061e+06 |
| 16 | kansas | 2853118.0 | 24.143810 | 2.164267e+06 |
| 17 | kentucky | 4339367.0 | 23.425833 | 3.322834e+06 |
| 18 | louisiana | 4533372.0 | 24.717188 | 3.412850e+06 |
| 19 | maine | 1328361.0 | 20.337500 | 1.058206e+06 |
| 20 | maryland | 5773552.0 | 22.825000 | 4.455739e+06 |
| 21 | massachusetts | 6547629.0 | 20.821429 | 5.184319e+06 |
| 22 | michigan | 9883640.0 | 21.756627 | 7.733293e+06 |
| 23 | minnesota | 5303925.0 | 23.708046 | 4.046468e+06 |
| 24 | mississippi | 2967297.0 | 25.182927 | 2.220045e+06 |
| 25 | missouri | 5988927.0 | 23.580870 | 4.576686e+06 |
| 26 | montana | 989415.0 | 22.301786 | 7.687578e+05 |
| 27 | nebraska | 1826341.0 | 23.709677 | 1.393321e+06 |
| 28 | nevada | 2700551.0 | 22.952941 | 2.080695e+06 |
| 29 | new hampshire | 1316470.0 | 20.600000 | 1.045277e+06 |
| 30 | new jersey | 8791894.0 | 23.428571 | 6.732079e+06 |
| 31 | new mexico | 2059179.0 | 23.624242 | 1.572714e+06 |
| 32 | new york | 19378102.0 | 21.903226 | 1.513367e+07 |
| 33 | north carolina | 9535483.0 | 22.575000 | 7.382848e+06 |
| 34 | north dakota | 672591.0 | 21.805660 | 5.259281e+05 |
| 35 | ohio | 11536504.0 | 23.939773 | 8.774691e+06 |
| 36 | oklahoma | 3751351.0 | 24.337662 | 2.838360e+06 |
| 37 | oregon | 3831074.0 | 21.869444 | 2.993239e+06 |
| 38 | pennsylvania | 12702379.0 | 21.217910 | 1.000720e+07 |
| 39 | rhode island | 1052567.0 | 20.580000 | 8.359487e+05 |
| 40 | south carolina | 4625364.0 | 23.213043 | 3.551676e+06 |
| 41 | south dakota | 814180.0 | 25.281818 | 6.083405e+05 |
| 42 | tennessee | 6346105.0 | 22.808421 | 4.898659e+06 |
| 43 | texas | 25145561.0 | 25.064173 | 1.884303e+07 |
| 44 | utah | 2763885.0 | 31.079310 | 1.904889e+06 |
| 45 | vermont | 625741.0 | 20.785714 | 4.956763e+05 |
| 46 | virginia | 7994802.0 | 21.342105 | 6.288543e+06 |
| 47 | washington | 6724540.0 | 22.794872 | 5.191690e+06 |
| 48 | west virginia | 1852994.0 | 20.760000 | 1.468312e+06 |
| 49 | wisconsin | 5686986.0 | 22.684722 | 4.396909e+06 |
| 50 | wyoming | 563626.0 | 23.382609 | 4.318355e+05 |
print(type(compg))
<class 'pandas.core.frame.DataFrame'>
newtest = house.groupby(['state','year'], as_index=False).agg({'candidatevotes': 'sum'})
newtest = newtest.groupby('year').get_group((2010))
newtest
| state | year | candidatevotes | |
|---|---|---|---|
| 17 | alabama | 2010 | 1367747 |
| 40 | alaska | 2010 | 254335 |
| 63 | arizona | 2010 | 1698145 |
| 86 | arkansas | 2010 | 774125 |
| 109 | california | 2010 | 9648096 |
| 132 | colorado | 2010 | 1763152 |
| 155 | connecticut | 2010 | 1138202 |
| 178 | delaware | 2010 | 305636 |
| 202 | florida | 2010 | 5117811 |
| 225 | georgia | 2010 | 2468680 |
| 248 | hawaii | 2010 | 360121 |
| 271 | idaho | 2010 | 447144 |
| 294 | illinois | 2010 | 3696159 |
| 317 | indiana | 2010 | 1747720 |
| 340 | iowa | 2010 | 1106591 |
| 363 | kansas | 2010 | 835529 |
| 386 | kentucky | 2010 | 1354298 |
| 409 | louisiana | 2010 | 1035948 |
| 432 | maine | 2010 | 564368 |
| 455 | maryland | 2010 | 1825472 |
| 478 | massachusetts | 2010 | 2224255 |
| 501 | michigan | 2010 | 3194901 |
| 524 | minnesota | 2010 | 2090701 |
| 547 | mississippi | 2010 | 788549 |
| 570 | missouri | 2010 | 1920675 |
| 593 | montana | 2010 | 360341 |
| 616 | nebraska | 2010 | 485546 |
| 639 | nevada | 2010 | 702788 |
| 662 | new hampshire | 2010 | 449787 |
| 685 | new jersey | 2010 | 2121584 |
| 708 | new mexico | 2010 | 596651 |
| 731 | new york | 2010 | 4753783 |
| 754 | north carolina | 2010 | 2662549 |
| 777 | north dakota | 2010 | 236344 |
| 800 | ohio | 2010 | 3825274 |
| 823 | oklahoma | 2010 | 792980 |
| 846 | oregon | 2010 | 1429356 |
| 869 | pennsylvania | 2010 | 3956401 |
| 892 | rhode island | 2010 | 335484 |
| 915 | south carolina | 2010 | 1340189 |
| 938 | south dakota | 2010 | 319426 |
| 961 | tennessee | 2010 | 1559129 |
| 984 | texas | 2010 | 4745545 |
| 1007 | utah | 2010 | 640495 |
| 1030 | vermont | 2010 | 238521 |
| 1053 | virginia | 2010 | 2189841 |
| 1076 | washington | 2010 | 2479409 |
| 1099 | west virginia | 2010 | 514373 |
| 1122 | wisconsin | 2010 | 2140482 |
| 1145 | wyoming | 2010 | 190822 |
fullh = pd.merge(compg, newtest, on=['state'], how='outer')
fullh
| state | pop2010 | age_under_18_2010 | can_vote_2010 | year | candidatevotes | |
|---|---|---|---|---|---|---|
| 0 | alabama | 4779736.0 | 23.479104 | 3.657497e+06 | 2010.0 | 1367747.0 |
| 1 | alaska | 710231.0 | 25.493103 | 5.291711e+05 | 2010.0 | 254335.0 |
| 2 | arizona | 6392017.0 | 25.306667 | 4.774411e+06 | 2010.0 | 1698145.0 |
| 3 | arkansas | 2915918.0 | 23.542667 | 2.229433e+06 | 2010.0 | 774125.0 |
| 4 | california | 37253956.0 | 23.658621 | 2.844018e+07 | 2010.0 | 9648096.0 |
| 5 | colorado | 5029196.0 | 22.057813 | 3.919865e+06 | 2010.0 | 1763152.0 |
| 6 | connecticut | 3574097.0 | 22.125000 | 2.783328e+06 | 2010.0 | 1138202.0 |
| 7 | delaware | 897934.0 | 22.833333 | 6.929057e+05 | 2010.0 | 305636.0 |
| 8 | district of columbia | 601723.0 | 16.800000 | 5.006335e+05 | NaN | NaN |
| 9 | florida | 18801310.0 | 20.776119 | 1.489513e+07 | 2010.0 | 5117811.0 |
| 10 | georgia | 9687653.0 | 24.550314 | 7.309304e+06 | 2010.0 | 2468680.0 |
| 11 | hawaii | 1360301.0 | 18.140000 | 1.113542e+06 | 2010.0 | 360121.0 |
| 12 | idaho | 1567582.0 | 26.511364 | 1.151995e+06 | 2010.0 | 447144.0 |
| 13 | illinois | 12830632.0 | 22.741176 | 9.912795e+06 | 2010.0 | 3696159.0 |
| 14 | indiana | 6483802.0 | 24.410870 | 4.901050e+06 | 2010.0 | 1747720.0 |
| 15 | iowa | 3046355.0 | 23.644444 | 2.326061e+06 | 2010.0 | 1106591.0 |
| 16 | kansas | 2853118.0 | 24.143810 | 2.164267e+06 | 2010.0 | 835529.0 |
| 17 | kentucky | 4339367.0 | 23.425833 | 3.322834e+06 | 2010.0 | 1354298.0 |
| 18 | louisiana | 4533372.0 | 24.717188 | 3.412850e+06 | 2010.0 | 1035948.0 |
| 19 | maine | 1328361.0 | 20.337500 | 1.058206e+06 | 2010.0 | 564368.0 |
| 20 | maryland | 5773552.0 | 22.825000 | 4.455739e+06 | 2010.0 | 1825472.0 |
| 21 | massachusetts | 6547629.0 | 20.821429 | 5.184319e+06 | 2010.0 | 2224255.0 |
| 22 | michigan | 9883640.0 | 21.756627 | 7.733293e+06 | 2010.0 | 3194901.0 |
| 23 | minnesota | 5303925.0 | 23.708046 | 4.046468e+06 | 2010.0 | 2090701.0 |
| 24 | mississippi | 2967297.0 | 25.182927 | 2.220045e+06 | 2010.0 | 788549.0 |
| 25 | missouri | 5988927.0 | 23.580870 | 4.576686e+06 | 2010.0 | 1920675.0 |
| 26 | montana | 989415.0 | 22.301786 | 7.687578e+05 | 2010.0 | 360341.0 |
| 27 | nebraska | 1826341.0 | 23.709677 | 1.393321e+06 | 2010.0 | 485546.0 |
| 28 | nevada | 2700551.0 | 22.952941 | 2.080695e+06 | 2010.0 | 702788.0 |
| 29 | new hampshire | 1316470.0 | 20.600000 | 1.045277e+06 | 2010.0 | 449787.0 |
| 30 | new jersey | 8791894.0 | 23.428571 | 6.732079e+06 | 2010.0 | 2121584.0 |
| 31 | new mexico | 2059179.0 | 23.624242 | 1.572714e+06 | 2010.0 | 596651.0 |
| 32 | new york | 19378102.0 | 21.903226 | 1.513367e+07 | 2010.0 | 4753783.0 |
| 33 | north carolina | 9535483.0 | 22.575000 | 7.382848e+06 | 2010.0 | 2662549.0 |
| 34 | north dakota | 672591.0 | 21.805660 | 5.259281e+05 | 2010.0 | 236344.0 |
| 35 | ohio | 11536504.0 | 23.939773 | 8.774691e+06 | 2010.0 | 3825274.0 |
| 36 | oklahoma | 3751351.0 | 24.337662 | 2.838360e+06 | 2010.0 | 792980.0 |
| 37 | oregon | 3831074.0 | 21.869444 | 2.993239e+06 | 2010.0 | 1429356.0 |
| 38 | pennsylvania | 12702379.0 | 21.217910 | 1.000720e+07 | 2010.0 | 3956401.0 |
| 39 | rhode island | 1052567.0 | 20.580000 | 8.359487e+05 | 2010.0 | 335484.0 |
| 40 | south carolina | 4625364.0 | 23.213043 | 3.551676e+06 | 2010.0 | 1340189.0 |
| 41 | south dakota | 814180.0 | 25.281818 | 6.083405e+05 | 2010.0 | 319426.0 |
| 42 | tennessee | 6346105.0 | 22.808421 | 4.898659e+06 | 2010.0 | 1559129.0 |
| 43 | texas | 25145561.0 | 25.064173 | 1.884303e+07 | 2010.0 | 4745545.0 |
| 44 | utah | 2763885.0 | 31.079310 | 1.904889e+06 | 2010.0 | 640495.0 |
| 45 | vermont | 625741.0 | 20.785714 | 4.956763e+05 | 2010.0 | 238521.0 |
| 46 | virginia | 7994802.0 | 21.342105 | 6.288543e+06 | 2010.0 | 2189841.0 |
| 47 | washington | 6724540.0 | 22.794872 | 5.191690e+06 | 2010.0 | 2479409.0 |
| 48 | west virginia | 1852994.0 | 20.760000 | 1.468312e+06 | 2010.0 | 514373.0 |
| 49 | wisconsin | 5686986.0 | 22.684722 | 4.396909e+06 | 2010.0 | 2140482.0 |
| 50 | wyoming | 563626.0 | 23.382609 | 4.318355e+05 | 2010.0 | 190822.0 |
fullh.head(20)
| state | pop2010 | age_under_18_2010 | can_vote_2010 | year | candidatevotes | |
|---|---|---|---|---|---|---|
| 0 | alabama | 4779736.0 | 23.479104 | 3.657497e+06 | 2010.0 | 1367747.0 |
| 1 | alaska | 710231.0 | 25.493103 | 5.291711e+05 | 2010.0 | 254335.0 |
| 2 | arizona | 6392017.0 | 25.306667 | 4.774411e+06 | 2010.0 | 1698145.0 |
| 3 | arkansas | 2915918.0 | 23.542667 | 2.229433e+06 | 2010.0 | 774125.0 |
| 4 | california | 37253956.0 | 23.658621 | 2.844018e+07 | 2010.0 | 9648096.0 |
| 5 | colorado | 5029196.0 | 22.057813 | 3.919865e+06 | 2010.0 | 1763152.0 |
| 6 | connecticut | 3574097.0 | 22.125000 | 2.783328e+06 | 2010.0 | 1138202.0 |
| 7 | delaware | 897934.0 | 22.833333 | 6.929057e+05 | 2010.0 | 305636.0 |
| 8 | district of columbia | 601723.0 | 16.800000 | 5.006335e+05 | NaN | NaN |
| 9 | florida | 18801310.0 | 20.776119 | 1.489513e+07 | 2010.0 | 5117811.0 |
| 10 | georgia | 9687653.0 | 24.550314 | 7.309304e+06 | 2010.0 | 2468680.0 |
| 11 | hawaii | 1360301.0 | 18.140000 | 1.113542e+06 | 2010.0 | 360121.0 |
| 12 | idaho | 1567582.0 | 26.511364 | 1.151995e+06 | 2010.0 | 447144.0 |
| 13 | illinois | 12830632.0 | 22.741176 | 9.912795e+06 | 2010.0 | 3696159.0 |
| 14 | indiana | 6483802.0 | 24.410870 | 4.901050e+06 | 2010.0 | 1747720.0 |
| 15 | iowa | 3046355.0 | 23.644444 | 2.326061e+06 | 2010.0 | 1106591.0 |
| 16 | kansas | 2853118.0 | 24.143810 | 2.164267e+06 | 2010.0 | 835529.0 |
| 17 | kentucky | 4339367.0 | 23.425833 | 3.322834e+06 | 2010.0 | 1354298.0 |
| 18 | louisiana | 4533372.0 | 24.717188 | 3.412850e+06 | 2010.0 | 1035948.0 |
| 19 | maine | 1328361.0 | 20.337500 | 1.058206e+06 | 2010.0 | 564368.0 |
turn out for year 2010 is candidatevotes sum from all counties and districts in the state that election cycle (column : candidatevotes) divided by column (can_vote_2010) which was extracted from county complete pop2010 indicating the population in each county in the year 2010 times (1-age_under_18_2010/100).
Ages under 18 cannot vote so dropped them.
fullh['voter_turnout_2010'] = fullh['candidatevotes']/fullh['can_vote_2010']
fullh
| state | pop2010 | age_under_18_2010 | can_vote_2010 | year | candidatevotes | voter_turnout_2010 | |
|---|---|---|---|---|---|---|---|
| 0 | alabama | 4779736.0 | 23.479104 | 3.657497e+06 | 2010.0 | 1367747.0 | 0.373957 |
| 1 | alaska | 710231.0 | 25.493103 | 5.291711e+05 | 2010.0 | 254335.0 | 0.480629 |
| 2 | arizona | 6392017.0 | 25.306667 | 4.774411e+06 | 2010.0 | 1698145.0 | 0.355676 |
| 3 | arkansas | 2915918.0 | 23.542667 | 2.229433e+06 | 2010.0 | 774125.0 | 0.347230 |
| 4 | california | 37253956.0 | 23.658621 | 2.844018e+07 | 2010.0 | 9648096.0 | 0.339242 |
| 5 | colorado | 5029196.0 | 22.057813 | 3.919865e+06 | 2010.0 | 1763152.0 | 0.449799 |
| 6 | connecticut | 3574097.0 | 22.125000 | 2.783328e+06 | 2010.0 | 1138202.0 | 0.408936 |
| 7 | delaware | 897934.0 | 22.833333 | 6.929057e+05 | 2010.0 | 305636.0 | 0.441093 |
| 8 | district of columbia | 601723.0 | 16.800000 | 5.006335e+05 | NaN | NaN | NaN |
| 9 | florida | 18801310.0 | 20.776119 | 1.489513e+07 | 2010.0 | 5117811.0 | 0.343590 |
| 10 | georgia | 9687653.0 | 24.550314 | 7.309304e+06 | 2010.0 | 2468680.0 | 0.337745 |
| 11 | hawaii | 1360301.0 | 18.140000 | 1.113542e+06 | 2010.0 | 360121.0 | 0.323401 |
| 12 | idaho | 1567582.0 | 26.511364 | 1.151995e+06 | 2010.0 | 447144.0 | 0.388148 |
| 13 | illinois | 12830632.0 | 22.741176 | 9.912795e+06 | 2010.0 | 3696159.0 | 0.372867 |
| 14 | indiana | 6483802.0 | 24.410870 | 4.901050e+06 | 2010.0 | 1747720.0 | 0.356601 |
| 15 | iowa | 3046355.0 | 23.644444 | 2.326061e+06 | 2010.0 | 1106591.0 | 0.475736 |
| 16 | kansas | 2853118.0 | 24.143810 | 2.164267e+06 | 2010.0 | 835529.0 | 0.386056 |
| 17 | kentucky | 4339367.0 | 23.425833 | 3.322834e+06 | 2010.0 | 1354298.0 | 0.407573 |
| 18 | louisiana | 4533372.0 | 24.717188 | 3.412850e+06 | 2010.0 | 1035948.0 | 0.303543 |
| 19 | maine | 1328361.0 | 20.337500 | 1.058206e+06 | 2010.0 | 564368.0 | 0.533325 |
| 20 | maryland | 5773552.0 | 22.825000 | 4.455739e+06 | 2010.0 | 1825472.0 | 0.409690 |
| 21 | massachusetts | 6547629.0 | 20.821429 | 5.184319e+06 | 2010.0 | 2224255.0 | 0.429035 |
| 22 | michigan | 9883640.0 | 21.756627 | 7.733293e+06 | 2010.0 | 3194901.0 | 0.413136 |
| 23 | minnesota | 5303925.0 | 23.708046 | 4.046468e+06 | 2010.0 | 2090701.0 | 0.516673 |
| 24 | mississippi | 2967297.0 | 25.182927 | 2.220045e+06 | 2010.0 | 788549.0 | 0.355195 |
| 25 | missouri | 5988927.0 | 23.580870 | 4.576686e+06 | 2010.0 | 1920675.0 | 0.419665 |
| 26 | montana | 989415.0 | 22.301786 | 7.687578e+05 | 2010.0 | 360341.0 | 0.468732 |
| 27 | nebraska | 1826341.0 | 23.709677 | 1.393321e+06 | 2010.0 | 485546.0 | 0.348481 |
| 28 | nevada | 2700551.0 | 22.952941 | 2.080695e+06 | 2010.0 | 702788.0 | 0.337766 |
| 29 | new hampshire | 1316470.0 | 20.600000 | 1.045277e+06 | 2010.0 | 449787.0 | 0.430304 |
| 30 | new jersey | 8791894.0 | 23.428571 | 6.732079e+06 | 2010.0 | 2121584.0 | 0.315145 |
| 31 | new mexico | 2059179.0 | 23.624242 | 1.572714e+06 | 2010.0 | 596651.0 | 0.379377 |
| 32 | new york | 19378102.0 | 21.903226 | 1.513367e+07 | 2010.0 | 4753783.0 | 0.314120 |
| 33 | north carolina | 9535483.0 | 22.575000 | 7.382848e+06 | 2010.0 | 2662549.0 | 0.360640 |
| 34 | north dakota | 672591.0 | 21.805660 | 5.259281e+05 | 2010.0 | 236344.0 | 0.449385 |
| 35 | ohio | 11536504.0 | 23.939773 | 8.774691e+06 | 2010.0 | 3825274.0 | 0.435944 |
| 36 | oklahoma | 3751351.0 | 24.337662 | 2.838360e+06 | 2010.0 | 792980.0 | 0.279380 |
| 37 | oregon | 3831074.0 | 21.869444 | 2.993239e+06 | 2010.0 | 1429356.0 | 0.477528 |
| 38 | pennsylvania | 12702379.0 | 21.217910 | 1.000720e+07 | 2010.0 | 3956401.0 | 0.395355 |
| 39 | rhode island | 1052567.0 | 20.580000 | 8.359487e+05 | 2010.0 | 335484.0 | 0.401321 |
| 40 | south carolina | 4625364.0 | 23.213043 | 3.551676e+06 | 2010.0 | 1340189.0 | 0.377340 |
| 41 | south dakota | 814180.0 | 25.281818 | 6.083405e+05 | 2010.0 | 319426.0 | 0.525078 |
| 42 | tennessee | 6346105.0 | 22.808421 | 4.898659e+06 | 2010.0 | 1559129.0 | 0.318277 |
| 43 | texas | 25145561.0 | 25.064173 | 1.884303e+07 | 2010.0 | 4745545.0 | 0.251846 |
| 44 | utah | 2763885.0 | 31.079310 | 1.904889e+06 | 2010.0 | 640495.0 | 0.336238 |
| 45 | vermont | 625741.0 | 20.785714 | 4.956763e+05 | 2010.0 | 238521.0 | 0.481203 |
| 46 | virginia | 7994802.0 | 21.342105 | 6.288543e+06 | 2010.0 | 2189841.0 | 0.348227 |
| 47 | washington | 6724540.0 | 22.794872 | 5.191690e+06 | 2010.0 | 2479409.0 | 0.477573 |
| 48 | west virginia | 1852994.0 | 20.760000 | 1.468312e+06 | 2010.0 | 514373.0 | 0.350316 |
| 49 | wisconsin | 5686986.0 | 22.684722 | 4.396909e+06 | 2010.0 | 2140482.0 | 0.486815 |
| 50 | wyoming | 563626.0 | 23.382609 | 4.318355e+05 | 2010.0 | 190822.0 | 0.441886 |
x = fullh.drop(columns=['voter_turnout_2010'])
y = fullh[['state', 'voter_turnout_2010']]
display(x)
display(y)
| state | pop2010 | age_under_18_2010 | can_vote_2010 | year | candidatevotes | |
|---|---|---|---|---|---|---|
| 0 | alabama | 4779736.0 | 23.479104 | 3.657497e+06 | 2010.0 | 1367747.0 |
| 1 | alaska | 710231.0 | 25.493103 | 5.291711e+05 | 2010.0 | 254335.0 |
| 2 | arizona | 6392017.0 | 25.306667 | 4.774411e+06 | 2010.0 | 1698145.0 |
| 3 | arkansas | 2915918.0 | 23.542667 | 2.229433e+06 | 2010.0 | 774125.0 |
| 4 | california | 37253956.0 | 23.658621 | 2.844018e+07 | 2010.0 | 9648096.0 |
| 5 | colorado | 5029196.0 | 22.057813 | 3.919865e+06 | 2010.0 | 1763152.0 |
| 6 | connecticut | 3574097.0 | 22.125000 | 2.783328e+06 | 2010.0 | 1138202.0 |
| 7 | delaware | 897934.0 | 22.833333 | 6.929057e+05 | 2010.0 | 305636.0 |
| 8 | district of columbia | 601723.0 | 16.800000 | 5.006335e+05 | NaN | NaN |
| 9 | florida | 18801310.0 | 20.776119 | 1.489513e+07 | 2010.0 | 5117811.0 |
| 10 | georgia | 9687653.0 | 24.550314 | 7.309304e+06 | 2010.0 | 2468680.0 |
| 11 | hawaii | 1360301.0 | 18.140000 | 1.113542e+06 | 2010.0 | 360121.0 |
| 12 | idaho | 1567582.0 | 26.511364 | 1.151995e+06 | 2010.0 | 447144.0 |
| 13 | illinois | 12830632.0 | 22.741176 | 9.912795e+06 | 2010.0 | 3696159.0 |
| 14 | indiana | 6483802.0 | 24.410870 | 4.901050e+06 | 2010.0 | 1747720.0 |
| 15 | iowa | 3046355.0 | 23.644444 | 2.326061e+06 | 2010.0 | 1106591.0 |
| 16 | kansas | 2853118.0 | 24.143810 | 2.164267e+06 | 2010.0 | 835529.0 |
| 17 | kentucky | 4339367.0 | 23.425833 | 3.322834e+06 | 2010.0 | 1354298.0 |
| 18 | louisiana | 4533372.0 | 24.717188 | 3.412850e+06 | 2010.0 | 1035948.0 |
| 19 | maine | 1328361.0 | 20.337500 | 1.058206e+06 | 2010.0 | 564368.0 |
| 20 | maryland | 5773552.0 | 22.825000 | 4.455739e+06 | 2010.0 | 1825472.0 |
| 21 | massachusetts | 6547629.0 | 20.821429 | 5.184319e+06 | 2010.0 | 2224255.0 |
| 22 | michigan | 9883640.0 | 21.756627 | 7.733293e+06 | 2010.0 | 3194901.0 |
| 23 | minnesota | 5303925.0 | 23.708046 | 4.046468e+06 | 2010.0 | 2090701.0 |
| 24 | mississippi | 2967297.0 | 25.182927 | 2.220045e+06 | 2010.0 | 788549.0 |
| 25 | missouri | 5988927.0 | 23.580870 | 4.576686e+06 | 2010.0 | 1920675.0 |
| 26 | montana | 989415.0 | 22.301786 | 7.687578e+05 | 2010.0 | 360341.0 |
| 27 | nebraska | 1826341.0 | 23.709677 | 1.393321e+06 | 2010.0 | 485546.0 |
| 28 | nevada | 2700551.0 | 22.952941 | 2.080695e+06 | 2010.0 | 702788.0 |
| 29 | new hampshire | 1316470.0 | 20.600000 | 1.045277e+06 | 2010.0 | 449787.0 |
| 30 | new jersey | 8791894.0 | 23.428571 | 6.732079e+06 | 2010.0 | 2121584.0 |
| 31 | new mexico | 2059179.0 | 23.624242 | 1.572714e+06 | 2010.0 | 596651.0 |
| 32 | new york | 19378102.0 | 21.903226 | 1.513367e+07 | 2010.0 | 4753783.0 |
| 33 | north carolina | 9535483.0 | 22.575000 | 7.382848e+06 | 2010.0 | 2662549.0 |
| 34 | north dakota | 672591.0 | 21.805660 | 5.259281e+05 | 2010.0 | 236344.0 |
| 35 | ohio | 11536504.0 | 23.939773 | 8.774691e+06 | 2010.0 | 3825274.0 |
| 36 | oklahoma | 3751351.0 | 24.337662 | 2.838360e+06 | 2010.0 | 792980.0 |
| 37 | oregon | 3831074.0 | 21.869444 | 2.993239e+06 | 2010.0 | 1429356.0 |
| 38 | pennsylvania | 12702379.0 | 21.217910 | 1.000720e+07 | 2010.0 | 3956401.0 |
| 39 | rhode island | 1052567.0 | 20.580000 | 8.359487e+05 | 2010.0 | 335484.0 |
| 40 | south carolina | 4625364.0 | 23.213043 | 3.551676e+06 | 2010.0 | 1340189.0 |
| 41 | south dakota | 814180.0 | 25.281818 | 6.083405e+05 | 2010.0 | 319426.0 |
| 42 | tennessee | 6346105.0 | 22.808421 | 4.898659e+06 | 2010.0 | 1559129.0 |
| 43 | texas | 25145561.0 | 25.064173 | 1.884303e+07 | 2010.0 | 4745545.0 |
| 44 | utah | 2763885.0 | 31.079310 | 1.904889e+06 | 2010.0 | 640495.0 |
| 45 | vermont | 625741.0 | 20.785714 | 4.956763e+05 | 2010.0 | 238521.0 |
| 46 | virginia | 7994802.0 | 21.342105 | 6.288543e+06 | 2010.0 | 2189841.0 |
| 47 | washington | 6724540.0 | 22.794872 | 5.191690e+06 | 2010.0 | 2479409.0 |
| 48 | west virginia | 1852994.0 | 20.760000 | 1.468312e+06 | 2010.0 | 514373.0 |
| 49 | wisconsin | 5686986.0 | 22.684722 | 4.396909e+06 | 2010.0 | 2140482.0 |
| 50 | wyoming | 563626.0 | 23.382609 | 4.318355e+05 | 2010.0 | 190822.0 |
| state | voter_turnout_2010 | |
|---|---|---|
| 0 | alabama | 0.373957 |
| 1 | alaska | 0.480629 |
| 2 | arizona | 0.355676 |
| 3 | arkansas | 0.347230 |
| 4 | california | 0.339242 |
| 5 | colorado | 0.449799 |
| 6 | connecticut | 0.408936 |
| 7 | delaware | 0.441093 |
| 8 | district of columbia | NaN |
| 9 | florida | 0.343590 |
| 10 | georgia | 0.337745 |
| 11 | hawaii | 0.323401 |
| 12 | idaho | 0.388148 |
| 13 | illinois | 0.372867 |
| 14 | indiana | 0.356601 |
| 15 | iowa | 0.475736 |
| 16 | kansas | 0.386056 |
| 17 | kentucky | 0.407573 |
| 18 | louisiana | 0.303543 |
| 19 | maine | 0.533325 |
| 20 | maryland | 0.409690 |
| 21 | massachusetts | 0.429035 |
| 22 | michigan | 0.413136 |
| 23 | minnesota | 0.516673 |
| 24 | mississippi | 0.355195 |
| 25 | missouri | 0.419665 |
| 26 | montana | 0.468732 |
| 27 | nebraska | 0.348481 |
| 28 | nevada | 0.337766 |
| 29 | new hampshire | 0.430304 |
| 30 | new jersey | 0.315145 |
| 31 | new mexico | 0.379377 |
| 32 | new york | 0.314120 |
| 33 | north carolina | 0.360640 |
| 34 | north dakota | 0.449385 |
| 35 | ohio | 0.435944 |
| 36 | oklahoma | 0.279380 |
| 37 | oregon | 0.477528 |
| 38 | pennsylvania | 0.395355 |
| 39 | rhode island | 0.401321 |
| 40 | south carolina | 0.377340 |
| 41 | south dakota | 0.525078 |
| 42 | tennessee | 0.318277 |
| 43 | texas | 0.251846 |
| 44 | utah | 0.336238 |
| 45 | vermont | 0.481203 |
| 46 | virginia | 0.348227 |
| 47 | washington | 0.477573 |
| 48 | west virginia | 0.350316 |
| 49 | wisconsin | 0.486815 |
| 50 | wyoming | 0.441886 |
x_test = x[x['state'].isin(['california', 'florida', 'south dakota', 'wyoming'])]
y_test = y[y['state'].isin(['california', 'florida', 'south dakota', 'wyoming'])]
x_train = x[~x['state'].isin(['california', 'florida', 'south dakota', 'wyoming'])]
y_train = y[~y['state'].isin(['california', 'florida', 'south dakota', 'wyoming'])]
display(x_train)
display(y_train)
'''
x_test = x.get(x['state'].isin(['california', 'florida', 'south dakota', 'wyoming']))
x_test
'''
| state | pop2010 | age_under_18_2010 | can_vote_2010 | year | candidatevotes | |
|---|---|---|---|---|---|---|
| 0 | alabama | 4779736.0 | 23.479104 | 3.657497e+06 | 2010.0 | 1367747.0 |
| 1 | alaska | 710231.0 | 25.493103 | 5.291711e+05 | 2010.0 | 254335.0 |
| 2 | arizona | 6392017.0 | 25.306667 | 4.774411e+06 | 2010.0 | 1698145.0 |
| 3 | arkansas | 2915918.0 | 23.542667 | 2.229433e+06 | 2010.0 | 774125.0 |
| 5 | colorado | 5029196.0 | 22.057813 | 3.919865e+06 | 2010.0 | 1763152.0 |
| 6 | connecticut | 3574097.0 | 22.125000 | 2.783328e+06 | 2010.0 | 1138202.0 |
| 7 | delaware | 897934.0 | 22.833333 | 6.929057e+05 | 2010.0 | 305636.0 |
| 8 | district of columbia | 601723.0 | 16.800000 | 5.006335e+05 | NaN | NaN |
| 10 | georgia | 9687653.0 | 24.550314 | 7.309304e+06 | 2010.0 | 2468680.0 |
| 11 | hawaii | 1360301.0 | 18.140000 | 1.113542e+06 | 2010.0 | 360121.0 |
| 12 | idaho | 1567582.0 | 26.511364 | 1.151995e+06 | 2010.0 | 447144.0 |
| 13 | illinois | 12830632.0 | 22.741176 | 9.912795e+06 | 2010.0 | 3696159.0 |
| 14 | indiana | 6483802.0 | 24.410870 | 4.901050e+06 | 2010.0 | 1747720.0 |
| 15 | iowa | 3046355.0 | 23.644444 | 2.326061e+06 | 2010.0 | 1106591.0 |
| 16 | kansas | 2853118.0 | 24.143810 | 2.164267e+06 | 2010.0 | 835529.0 |
| 17 | kentucky | 4339367.0 | 23.425833 | 3.322834e+06 | 2010.0 | 1354298.0 |
| 18 | louisiana | 4533372.0 | 24.717188 | 3.412850e+06 | 2010.0 | 1035948.0 |
| 19 | maine | 1328361.0 | 20.337500 | 1.058206e+06 | 2010.0 | 564368.0 |
| 20 | maryland | 5773552.0 | 22.825000 | 4.455739e+06 | 2010.0 | 1825472.0 |
| 21 | massachusetts | 6547629.0 | 20.821429 | 5.184319e+06 | 2010.0 | 2224255.0 |
| 22 | michigan | 9883640.0 | 21.756627 | 7.733293e+06 | 2010.0 | 3194901.0 |
| 23 | minnesota | 5303925.0 | 23.708046 | 4.046468e+06 | 2010.0 | 2090701.0 |
| 24 | mississippi | 2967297.0 | 25.182927 | 2.220045e+06 | 2010.0 | 788549.0 |
| 25 | missouri | 5988927.0 | 23.580870 | 4.576686e+06 | 2010.0 | 1920675.0 |
| 26 | montana | 989415.0 | 22.301786 | 7.687578e+05 | 2010.0 | 360341.0 |
| 27 | nebraska | 1826341.0 | 23.709677 | 1.393321e+06 | 2010.0 | 485546.0 |
| 28 | nevada | 2700551.0 | 22.952941 | 2.080695e+06 | 2010.0 | 702788.0 |
| 29 | new hampshire | 1316470.0 | 20.600000 | 1.045277e+06 | 2010.0 | 449787.0 |
| 30 | new jersey | 8791894.0 | 23.428571 | 6.732079e+06 | 2010.0 | 2121584.0 |
| 31 | new mexico | 2059179.0 | 23.624242 | 1.572714e+06 | 2010.0 | 596651.0 |
| 32 | new york | 19378102.0 | 21.903226 | 1.513367e+07 | 2010.0 | 4753783.0 |
| 33 | north carolina | 9535483.0 | 22.575000 | 7.382848e+06 | 2010.0 | 2662549.0 |
| 34 | north dakota | 672591.0 | 21.805660 | 5.259281e+05 | 2010.0 | 236344.0 |
| 35 | ohio | 11536504.0 | 23.939773 | 8.774691e+06 | 2010.0 | 3825274.0 |
| 36 | oklahoma | 3751351.0 | 24.337662 | 2.838360e+06 | 2010.0 | 792980.0 |
| 37 | oregon | 3831074.0 | 21.869444 | 2.993239e+06 | 2010.0 | 1429356.0 |
| 38 | pennsylvania | 12702379.0 | 21.217910 | 1.000720e+07 | 2010.0 | 3956401.0 |
| 39 | rhode island | 1052567.0 | 20.580000 | 8.359487e+05 | 2010.0 | 335484.0 |
| 40 | south carolina | 4625364.0 | 23.213043 | 3.551676e+06 | 2010.0 | 1340189.0 |
| 42 | tennessee | 6346105.0 | 22.808421 | 4.898659e+06 | 2010.0 | 1559129.0 |
| 43 | texas | 25145561.0 | 25.064173 | 1.884303e+07 | 2010.0 | 4745545.0 |
| 44 | utah | 2763885.0 | 31.079310 | 1.904889e+06 | 2010.0 | 640495.0 |
| 45 | vermont | 625741.0 | 20.785714 | 4.956763e+05 | 2010.0 | 238521.0 |
| 46 | virginia | 7994802.0 | 21.342105 | 6.288543e+06 | 2010.0 | 2189841.0 |
| 47 | washington | 6724540.0 | 22.794872 | 5.191690e+06 | 2010.0 | 2479409.0 |
| 48 | west virginia | 1852994.0 | 20.760000 | 1.468312e+06 | 2010.0 | 514373.0 |
| 49 | wisconsin | 5686986.0 | 22.684722 | 4.396909e+06 | 2010.0 | 2140482.0 |
| state | voter_turnout_2010 | |
|---|---|---|
| 0 | alabama | 0.373957 |
| 1 | alaska | 0.480629 |
| 2 | arizona | 0.355676 |
| 3 | arkansas | 0.347230 |
| 5 | colorado | 0.449799 |
| 6 | connecticut | 0.408936 |
| 7 | delaware | 0.441093 |
| 8 | district of columbia | NaN |
| 10 | georgia | 0.337745 |
| 11 | hawaii | 0.323401 |
| 12 | idaho | 0.388148 |
| 13 | illinois | 0.372867 |
| 14 | indiana | 0.356601 |
| 15 | iowa | 0.475736 |
| 16 | kansas | 0.386056 |
| 17 | kentucky | 0.407573 |
| 18 | louisiana | 0.303543 |
| 19 | maine | 0.533325 |
| 20 | maryland | 0.409690 |
| 21 | massachusetts | 0.429035 |
| 22 | michigan | 0.413136 |
| 23 | minnesota | 0.516673 |
| 24 | mississippi | 0.355195 |
| 25 | missouri | 0.419665 |
| 26 | montana | 0.468732 |
| 27 | nebraska | 0.348481 |
| 28 | nevada | 0.337766 |
| 29 | new hampshire | 0.430304 |
| 30 | new jersey | 0.315145 |
| 31 | new mexico | 0.379377 |
| 32 | new york | 0.314120 |
| 33 | north carolina | 0.360640 |
| 34 | north dakota | 0.449385 |
| 35 | ohio | 0.435944 |
| 36 | oklahoma | 0.279380 |
| 37 | oregon | 0.477528 |
| 38 | pennsylvania | 0.395355 |
| 39 | rhode island | 0.401321 |
| 40 | south carolina | 0.377340 |
| 42 | tennessee | 0.318277 |
| 43 | texas | 0.251846 |
| 44 | utah | 0.336238 |
| 45 | vermont | 0.481203 |
| 46 | virginia | 0.348227 |
| 47 | washington | 0.477573 |
| 48 | west virginia | 0.350316 |
| 49 | wisconsin | 0.486815 |
"\nx_test = x.get(x['state'].isin(['california', 'florida', 'south dakota', 'wyoming']))\nx_test\n"
fig = plt.figure(figsize = (12,12))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('State', fontsize = 15)
ax.set_ylabel('Voter Turnout 2010', fontsize = 15)
ax.set_title('State Vs. Voter Turn out 2010', fontsize = 20, color='cornflowerblue')
targets = ['california', 'florida', 'south dakota', 'wyoming']
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
indicesToKeep = fullh['state'] == target
ax.scatter(fullh.loc[indicesToKeep, 'state']
, fullh.loc[indicesToKeep, 'voter_turnout_2010']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
for col in x_train.columns:
if x_train[col].dtype == 'object':
x_train[col] = pd.factorize(x_train[col])[0]
x_test[col] = pd.factorize(x_test[col])[0]
x_train.dropna(inplace=True)
x_test.dropna(inplace=True)
for col in y_train.columns:
if y_train[col].dtype == 'object':
y_train[col] = pd.factorize(y_train[col])[0]
y_test[col] = pd.factorize(y_test[col])[0]
y_test.dropna(inplace=True)
y_train.dropna(inplace=True)
display(x_test)
display(y_train)
C:\Users\lover\anaconda3\lib\site-packages\pandas\util\_decorators.py:311: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return func(*args, **kwargs)
| state | pop2010 | age_under_18_2010 | can_vote_2010 | year | candidatevotes | |
|---|---|---|---|---|---|---|
| 4 | 0 | 37253956.0 | 23.658621 | 2.844018e+07 | 2010.0 | 9648096.0 |
| 9 | 1 | 18801310.0 | 20.776119 | 1.489513e+07 | 2010.0 | 5117811.0 |
| 41 | 2 | 814180.0 | 25.281818 | 6.083405e+05 | 2010.0 | 319426.0 |
| 50 | 3 | 563626.0 | 23.382609 | 4.318355e+05 | 2010.0 | 190822.0 |
| state | voter_turnout_2010 | |
|---|---|---|
| 0 | 0 | 0.373957 |
| 1 | 1 | 0.480629 |
| 2 | 2 | 0.355676 |
| 3 | 3 | 0.347230 |
| 5 | 4 | 0.449799 |
| 6 | 5 | 0.408936 |
| 7 | 6 | 0.441093 |
| 10 | 8 | 0.337745 |
| 11 | 9 | 0.323401 |
| 12 | 10 | 0.388148 |
| 13 | 11 | 0.372867 |
| 14 | 12 | 0.356601 |
| 15 | 13 | 0.475736 |
| 16 | 14 | 0.386056 |
| 17 | 15 | 0.407573 |
| 18 | 16 | 0.303543 |
| 19 | 17 | 0.533325 |
| 20 | 18 | 0.409690 |
| 21 | 19 | 0.429035 |
| 22 | 20 | 0.413136 |
| 23 | 21 | 0.516673 |
| 24 | 22 | 0.355195 |
| 25 | 23 | 0.419665 |
| 26 | 24 | 0.468732 |
| 27 | 25 | 0.348481 |
| 28 | 26 | 0.337766 |
| 29 | 27 | 0.430304 |
| 30 | 28 | 0.315145 |
| 31 | 29 | 0.379377 |
| 32 | 30 | 0.314120 |
| 33 | 31 | 0.360640 |
| 34 | 32 | 0.449385 |
| 35 | 33 | 0.435944 |
| 36 | 34 | 0.279380 |
| 37 | 35 | 0.477528 |
| 38 | 36 | 0.395355 |
| 39 | 37 | 0.401321 |
| 40 | 38 | 0.377340 |
| 42 | 39 | 0.318277 |
| 43 | 40 | 0.251846 |
| 44 | 41 | 0.336238 |
| 45 | 42 | 0.481203 |
| 46 | 43 | 0.348227 |
| 47 | 44 | 0.477573 |
| 48 | 45 | 0.350316 |
| 49 | 46 | 0.486815 |
import time
start = time.time()
dtreg = DecisionTreeRegressor(max_depth=10)
dtreg.fit(x_train, y_train['voter_turnout_2010'])
dtrPreds = dtreg.predict(x_test)
mse = mean_squared_error(y_test['voter_turnout_2010'], dtrPreds)
print("Decision Tree Regressor MSE:",mse," with a run time of ", time.time()-start,' seconds')
Decision Tree Regressor MSE: 0.0018329825128221636 with a run time of 0.007996082305908203 seconds
start = time.time()
adreg = AdaBoostRegressor(n_estimators=200,random_state=20)
adreg.fit(x_train, y_train['voter_turnout_2010'])
adrPreds = adreg.predict(x_test)
mse = mean_squared_error(y_test['voter_turnout_2010'], adrPreds)
print("Adaboost Regressor MSE:",mse," with a run time of ", time.time()-start,' seconds')
Adaboost Regressor MSE: 0.0014155571564051278 with a run time of 0.5939323902130127 seconds
start = time.time()
rfreg = RandomForestRegressor(n_estimators=200,random_state=20)
rfreg.fit(x_train, y_train['voter_turnout_2010'])
rfrPreds = rfreg.predict(x_test)
mse = mean_squared_error(y_test['voter_turnout_2010'], rfrPreds)
print("Random Forest Regressor MSE:",mse," with a run time of ", time.time()-start,' seconds')
Random Forest Regressor MSE: 0.001843402735859305 with a run time of 0.6360979080200195 seconds
start = time.time()
gbreg = GradientBoostingRegressor(n_estimators=200,random_state=20)
gbreg.fit(x_train, y_train['voter_turnout_2010'])
gbrPreds = gbreg.predict(x_test)
mse = mean_squared_error(y_test['voter_turnout_2010'], gbrPreds)
print("Gradientboost Regressor MSE:",mse," with a run time of ", time.time()-start,' seconds')
Gradientboost Regressor MSE: 0.0012963654826801318 with a run time of 0.13191962242126465 seconds
houseg12 = house.groupby(['state', 'year'], as_index=False).agg({'candidatevotes':'sum'})
houseg12 = houseg12.groupby('year').get_group((2012))
houseg12
| state | year | candidatevotes | |
|---|---|---|---|
| 18 | alabama | 2012 | 1933630 |
| 41 | alaska | 2012 | 289804 |
| 64 | arizona | 2012 | 2173317 |
| 87 | arkansas | 2012 | 1038054 |
| 110 | california | 2012 | 12204357 |
| 133 | colorado | 2012 | 2450488 |
| 156 | connecticut | 2012 | 1465510 |
| 179 | delaware | 2012 | 388059 |
| 203 | florida | 2012 | 7513536 |
| 226 | georgia | 2012 | 3552967 |
| 249 | hawaii | 2012 | 437159 |
| 272 | idaho | 2012 | 635218 |
| 295 | illinois | 2012 | 5058133 |
| 318 | indiana | 2012 | 2553746 |
| 341 | iowa | 2012 | 1536849 |
| 364 | kansas | 2012 | 1057739 |
| 387 | kentucky | 2012 | 1745377 |
| 410 | louisiana | 2012 | 1705617 |
| 433 | maine | 2012 | 724623 |
| 456 | maryland | 2012 | 2585514 |
| 479 | massachusetts | 2012 | 3184196 |
| 502 | michigan | 2012 | 4574632 |
| 525 | minnesota | 2012 | 2813383 |
| 548 | mississippi | 2012 | 1208175 |
| 571 | missouri | 2012 | 2675900 |
| 594 | montana | 2012 | 479740 |
| 617 | nebraska | 2012 | 772515 |
| 640 | nevada | 2012 | 973742 |
| 663 | new hampshire | 2012 | 682416 |
| 686 | new jersey | 2012 | 3281778 |
| 709 | new mexico | 2012 | 766090 |
| 732 | new york | 2012 | 7116336 |
| 755 | north carolina | 2012 | 4384112 |
| 778 | north dakota | 2012 | 316224 |
| 801 | ohio | 2012 | 5142126 |
| 824 | oklahoma | 2012 | 1325935 |
| 847 | oregon | 2012 | 1708168 |
| 870 | pennsylvania | 2012 | 5556330 |
| 893 | rhode island | 2012 | 427775 |
| 916 | south carolina | 2012 | 1802734 |
| 939 | south dakota | 2012 | 361429 |
| 962 | tennessee | 2012 | 2283727 |
| 985 | texas | 2012 | 7664208 |
| 1008 | utah | 2012 | 998897 |
| 1031 | vermont | 2012 | 289931 |
| 1054 | virginia | 2012 | 3740455 |
| 1077 | washington | 2012 | 3006266 |
| 1100 | west virginia | 2012 | 641354 |
| 1123 | wisconsin | 2012 | 2866050 |
| 1146 | wyoming | 2012 | 250700 |
for col in complete.columns:
if str(2010) in col or str(2012) in col or str(2008) in col :
print(col)
smoking_ban_2010 pop2010 pop2012 age_under_5_2010 age_under_18_2010 age_over_65_2010 female_2010 white_2010 black_2010 native_2010 asian_2010 pac_isl_2010 two_plus_races_2010 hispanic_2010 white_not_hispanic_2010 no_move_in_one_plus_year_2010 foreign_born_2010 foreign_spoken_at_home_2010 hs_grad_2010 bachelors_2010 veterans_2010 mean_work_travel_2010 housing_units_2010 homeownership_2010 housing_multi_unit_2010 median_val_owner_occupied_2010 households_2010 persons_per_household_2010 per_capita_income_2010 median_household_income_2010 building_permits_2010 area_2010 density_2010 poverty_2010 civilian_labor_force_2008 employed_2008 unemployed_2008 unemployment_rate_2008 civilian_labor_force_2010 employed_2010 unemployed_2010 unemployment_rate_2010 civilian_labor_force_2012 employed_2012 unemployed_2012 unemployment_rate_2012
for col in life.columns:
print(col)
fips state county year male life expectancy (years) years behind international frontier (male) female life expectancy (years) years behind international frontier (female) white male life expectancy (years) white female life expectancy (years) black male life expectancy (years) black female life expectancy (years) closest ranked countries for male life expectancy (higher) closest ranked countries for female life expectancy (higher) closest ranked countries for male life expectancy (lower) closest ranked countries for female life expectancy (lower) rank (male) rank (female) male life expectancy change 1987 to 2007 (years) female life expectancy change 1987 to 2007 (years) male life expectancy change 1987 to 1997 (years) female life expectancy change 1987 to 1997 (years) male life expectancy change 1997 to 2007 (years) female life expectancy change 1997 to 2007 (years)
compg12 = complete.groupby(['state'], as_index=False).agg({'pop2010':'sum', 'pop2012':'sum', 'age_under_18_2010': 'mean',
'hs_grad_2010': 'mean', 'age_over_65_2010':'mean'})
#compg12 = compg12[['state', 'pop2010', 'age_under_18_2010', 'hs_grad_2010', 'age_over_65_2010', 'pop2012']]
compg12.head(40)
| state | pop2010 | pop2012 | age_under_18_2010 | hs_grad_2010 | age_over_65_2010 | |
|---|---|---|---|---|---|---|
| 0 | alabama | 4779736.0 | 4.813946e+06 | 23.479104 | 76.782090 | 15.119403 |
| 1 | alaska | 710231.0 | 7.425877e+05 | 25.493103 | 86.593103 | 8.572414 |
| 2 | arizona | 6392017.0 | 6.544211e+06 | 25.306667 | 81.820000 | 16.526667 |
| 3 | arkansas | 2915918.0 | 2.949208e+06 | 23.542667 | 78.864000 | 16.686667 |
| 4 | california | 37253956.0 | 3.801901e+07 | 23.658621 | 82.458621 | 13.724138 |
| 5 | colorado | 5029196.0 | 5.186330e+06 | 22.057813 | 88.139062 | 14.396875 |
| 6 | connecticut | 3574097.0 | 3.597705e+06 | 22.125000 | 89.100000 | 14.112500 |
| 7 | delaware | 897934.0 | 9.168680e+05 | 22.833333 | 86.166667 | 15.533333 |
| 8 | district of columbia | 601723.0 | 6.356300e+05 | 16.800000 | 86.500000 | 11.400000 |
| 9 | florida | 18801310.0 | 1.934133e+07 | 20.776119 | 82.226866 | 18.074627 |
| 10 | georgia | 9687653.0 | 9.911171e+06 | 24.550314 | 77.172956 | 13.527673 |
| 11 | hawaii | 1360301.0 | 1.392772e+06 | 18.140000 | 87.420000 | 17.120000 |
| 12 | idaho | 1567582.0 | 1.594673e+06 | 26.511364 | 85.722727 | 14.915909 |
| 13 | illinois | 12830632.0 | 1.287849e+07 | 22.741176 | 86.344118 | 16.320588 |
| 14 | indiana | 6483802.0 | 6.535665e+06 | 24.410870 | 84.958696 | 14.528261 |
| 15 | iowa | 3046355.0 | 3.074386e+06 | 23.644444 | 88.998990 | 17.966667 |
| 16 | kansas | 2853118.0 | 2.885316e+06 | 24.143810 | 87.784762 | 18.093333 |
| 17 | kentucky | 4339367.0 | 4.383673e+06 | 23.425833 | 75.736667 | 14.533333 |
| 18 | louisiana | 4533372.0 | 4.602681e+06 | 24.717188 | 77.126563 | 13.215625 |
| 19 | maine | 1328361.0 | 1.328101e+06 | 20.337500 | 88.981250 | 17.143750 |
| 20 | maryland | 5773552.0 | 5.891680e+06 | 22.825000 | 87.091667 | 14.158333 |
| 21 | massachusetts | 6547629.0 | 6.659627e+06 | 20.821429 | 89.642857 | 14.800000 |
| 22 | michigan | 9883640.0 | 9.886610e+06 | 21.756627 | 87.491566 | 17.484337 |
| 23 | minnesota | 5303925.0 | 5.377695e+06 | 23.708046 | 88.998851 | 17.018391 |
| 24 | mississippi | 2967297.0 | 2.982963e+06 | 25.182927 | 75.954878 | 13.862195 |
| 25 | missouri | 5988927.0 | 6.023267e+06 | 23.580870 | 82.482609 | 16.790435 |
| 26 | montana | 989415.0 | 1.003522e+06 | 22.301786 | 88.460714 | 18.264286 |
| 27 | nebraska | 1826341.0 | 1.854862e+06 | 23.709677 | 89.673118 | 19.327957 |
| 28 | nevada | 2700551.0 | 2.752410e+06 | 22.952941 | 84.782353 | 15.917647 |
| 29 | new hampshire | 1316470.0 | 1.320923e+06 | 20.600000 | 90.030000 | 15.360000 |
| 30 | new jersey | 8791894.0 | 8.882095e+06 | 23.428571 | 87.428571 | 13.752381 |
| 31 | new mexico | 2059179.0 | 2.083590e+06 | 23.624242 | 81.636364 | 16.596970 |
| 32 | new york | 19378102.0 | 1.962541e+07 | 21.903226 | 86.861290 | 14.890323 |
| 33 | north carolina | 9535483.0 | 9.755299e+06 | 22.575000 | 80.282000 | 15.650000 |
| 34 | north dakota | 672591.0 | 7.013800e+05 | 21.805660 | 85.988679 | 20.192453 |
| 35 | ohio | 11536504.0 | 1.154697e+07 | 23.939773 | 85.865909 | 14.843182 |
| 36 | oklahoma | 3751351.0 | 3.815298e+06 | 24.337662 | 82.732468 | 16.038961 |
| 37 | oregon | 3831074.0 | 3.893920e+06 | 21.869444 | 87.586111 | 17.688889 |
| 38 | pennsylvania | 12702379.0 | 1.276803e+07 | 21.217910 | 86.647761 | 16.804478 |
| 39 | rhode island | 1052567.0 | 1.052761e+06 | 20.580000 | 87.540000 | 15.580000 |
popg = countyData.groupby(['state'], as_index=False).agg({'population': 'sum'})
popg
| state | population | |
|---|---|---|
| 0 | alabama | 4040587 |
| 1 | arizona | 3665228 |
| 2 | arkansas | 2350725 |
| 3 | california | 27030912 |
| 4 | colorado | 3294394 |
| 5 | connecticut | 3287116 |
| 6 | delaware | 666168 |
| 7 | florida | 12937926 |
| 8 | georgia | 6478216 |
| 9 | idaho | 1006749 |
| 10 | illinois | 11430602 |
| 11 | indiana | 5544159 |
| 12 | iowa | 2776755 |
| 13 | kansas | 2477574 |
| 14 | kentucky | 3685296 |
| 15 | louisiana | 3727191 |
| 16 | maine | 1227928 |
| 17 | maryland | 3316186 |
| 18 | massachusetts | 3429715 |
| 19 | michigan | 9231024 |
| 20 | minnesota | 4350415 |
| 21 | mississippi | 2505241 |
| 22 | missouri | 4642718 |
| 23 | montana | 751518 |
| 24 | nebraska | 1578385 |
| 25 | nevada | 1201833 |
| 26 | new hampshire | 303096 |
| 27 | new jersey | 7730188 |
| 28 | new mexico | 1515069 |
| 29 | new york | 17990455 |
| 30 | north carolina | 3636663 |
| 31 | north dakota | 566009 |
| 32 | ohio | 10847115 |
| 33 | oklahoma | 3145585 |
| 34 | oregon | 2842321 |
| 35 | pennsylvania | 7969652 |
| 36 | rhode island | 297188 |
| 37 | south carolina | 975853 |
| 38 | south dakota | 589915 |
| 39 | tennessee | 4877185 |
| 40 | texas | 16986510 |
| 41 | utah | 1722850 |
| 42 | vermont | 562758 |
| 43 | virginia | 3738146 |
| 44 | washington | 4793602 |
| 45 | west virginia | 1599249 |
| 46 | wisconsin | 4891769 |
| 47 | wyoming | 453588 |
fullh12 = pd.merge(popg, compg12)
fullh12
| state | population | pop2010 | pop2012 | age_under_18_2010 | hs_grad_2010 | age_over_65_2010 | |
|---|---|---|---|---|---|---|---|
| 0 | alabama | 4040587 | 4779736.0 | 4813946.0 | 23.479104 | 76.782090 | 15.119403 |
| 1 | arizona | 3665228 | 6392017.0 | 6544211.0 | 25.306667 | 81.820000 | 16.526667 |
| 2 | arkansas | 2350725 | 2915918.0 | 2949208.0 | 23.542667 | 78.864000 | 16.686667 |
| 3 | california | 27030912 | 37253956.0 | 38019006.0 | 23.658621 | 82.458621 | 13.724138 |
| 4 | colorado | 3294394 | 5029196.0 | 5186330.0 | 22.057813 | 88.139062 | 14.396875 |
| 5 | connecticut | 3287116 | 3574097.0 | 3597705.0 | 22.125000 | 89.100000 | 14.112500 |
| 6 | delaware | 666168 | 897934.0 | 916868.0 | 22.833333 | 86.166667 | 15.533333 |
| 7 | florida | 12937926 | 18801310.0 | 19341327.0 | 20.776119 | 82.226866 | 18.074627 |
| 8 | georgia | 6478216 | 9687653.0 | 9911171.0 | 24.550314 | 77.172956 | 13.527673 |
| 9 | idaho | 1006749 | 1567582.0 | 1594673.0 | 26.511364 | 85.722727 | 14.915909 |
| 10 | illinois | 11430602 | 12830632.0 | 12878494.0 | 22.741176 | 86.344118 | 16.320588 |
| 11 | indiana | 5544159 | 6483802.0 | 6535665.0 | 24.410870 | 84.958696 | 14.528261 |
| 12 | iowa | 2776755 | 3046355.0 | 3074386.0 | 23.644444 | 88.998990 | 17.966667 |
| 13 | kansas | 2477574 | 2853118.0 | 2885316.0 | 24.143810 | 87.784762 | 18.093333 |
| 14 | kentucky | 3685296 | 4339367.0 | 4383673.0 | 23.425833 | 75.736667 | 14.533333 |
| 15 | louisiana | 3727191 | 4533372.0 | 4602681.0 | 24.717188 | 77.126563 | 13.215625 |
| 16 | maine | 1227928 | 1328361.0 | 1328101.0 | 20.337500 | 88.981250 | 17.143750 |
| 17 | maryland | 3316186 | 5773552.0 | 5891680.0 | 22.825000 | 87.091667 | 14.158333 |
| 18 | massachusetts | 3429715 | 6547629.0 | 6659627.0 | 20.821429 | 89.642857 | 14.800000 |
| 19 | michigan | 9231024 | 9883640.0 | 9886610.0 | 21.756627 | 87.491566 | 17.484337 |
| 20 | minnesota | 4350415 | 5303925.0 | 5377695.0 | 23.708046 | 88.998851 | 17.018391 |
| 21 | mississippi | 2505241 | 2967297.0 | 2982963.0 | 25.182927 | 75.954878 | 13.862195 |
| 22 | missouri | 4642718 | 5988927.0 | 6023267.0 | 23.580870 | 82.482609 | 16.790435 |
| 23 | montana | 751518 | 989415.0 | 1003522.0 | 22.301786 | 88.460714 | 18.264286 |
| 24 | nebraska | 1578385 | 1826341.0 | 1854862.0 | 23.709677 | 89.673118 | 19.327957 |
| 25 | nevada | 1201833 | 2700551.0 | 2752410.0 | 22.952941 | 84.782353 | 15.917647 |
| 26 | new hampshire | 303096 | 1316470.0 | 1320923.0 | 20.600000 | 90.030000 | 15.360000 |
| 27 | new jersey | 7730188 | 8791894.0 | 8882095.0 | 23.428571 | 87.428571 | 13.752381 |
| 28 | new mexico | 1515069 | 2059179.0 | 2083590.0 | 23.624242 | 81.636364 | 16.596970 |
| 29 | new york | 17990455 | 19378102.0 | 19625409.0 | 21.903226 | 86.861290 | 14.890323 |
| 30 | north carolina | 3636663 | 9535483.0 | 9755299.0 | 22.575000 | 80.282000 | 15.650000 |
| 31 | north dakota | 566009 | 672591.0 | 701380.0 | 21.805660 | 85.988679 | 20.192453 |
| 32 | ohio | 10847115 | 11536504.0 | 11546969.0 | 23.939773 | 85.865909 | 14.843182 |
| 33 | oklahoma | 3145585 | 3751351.0 | 3815298.0 | 24.337662 | 82.732468 | 16.038961 |
| 34 | oregon | 2842321 | 3831074.0 | 3893920.0 | 21.869444 | 87.586111 | 17.688889 |
| 35 | pennsylvania | 7969652 | 12702379.0 | 12768034.0 | 21.217910 | 86.647761 | 16.804478 |
| 36 | rhode island | 297188 | 1052567.0 | 1052761.0 | 20.580000 | 87.540000 | 15.580000 |
| 37 | south carolina | 975853 | 4625364.0 | 4719009.0 | 23.213043 | 79.047826 | 14.595652 |
| 38 | south dakota | 589915 | 814180.0 | 832576.0 | 25.281818 | 86.231818 | 17.612121 |
| 39 | tennessee | 4877185 | 6346105.0 | 6450632.0 | 22.808421 | 77.427368 | 15.713684 |
| 40 | texas | 16986510 | 25145561.0 | 26078327.0 | 25.064173 | 77.331890 | 15.723622 |
| 41 | utah | 1722850 | 2763885.0 | 2854222.0 | 31.079310 | 89.734483 | 12.189655 |
| 42 | vermont | 562758 | 625741.0 | 625606.0 | 20.785714 | 89.671429 | 15.400000 |
| 43 | virginia | 3738146 | 7994802.0 | 8188656.0 | 21.342105 | 80.989474 | 15.621053 |
| 44 | washington | 4793602 | 6724540.0 | 6890899.0 | 22.794872 | 87.338462 | 16.187179 |
| 45 | west virginia | 1599249 | 1852994.0 | 1855360.0 | 20.760000 | 79.758182 | 16.810909 |
| 46 | wisconsin | 4891769 | 5686986.0 | 5721075.0 | 22.684722 | 88.718056 | 16.384722 |
| 47 | wyoming | 453588 | 563626.0 | 576608.0 | 23.382609 | 91.147826 | 14.413043 |
fullh12 = pd.concat([fullh12, fullh[['candidatevotes', 'can_vote_2010', 'voter_turnout_2010']]], axis = 1)
fullh12.rename(columns={'population':'pop1990'}, inplace=True)
fullh12.dropna(inplace=True)
fullh12.reset_index(drop=True, inplace=True)
fullh12
| state | pop1990 | pop2010 | pop2012 | age_under_18_2010 | hs_grad_2010 | age_over_65_2010 | candidatevotes | can_vote_2010 | voter_turnout_2010 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | alabama | 4040587.0 | 4779736.0 | 4813946.0 | 23.479104 | 76.782090 | 15.119403 | 1367747.0 | 3.657497e+06 | 0.373957 |
| 1 | arizona | 3665228.0 | 6392017.0 | 6544211.0 | 25.306667 | 81.820000 | 16.526667 | 254335.0 | 5.291711e+05 | 0.480629 |
| 2 | arkansas | 2350725.0 | 2915918.0 | 2949208.0 | 23.542667 | 78.864000 | 16.686667 | 1698145.0 | 4.774411e+06 | 0.355676 |
| 3 | california | 27030912.0 | 37253956.0 | 38019006.0 | 23.658621 | 82.458621 | 13.724138 | 774125.0 | 2.229433e+06 | 0.347230 |
| 4 | colorado | 3294394.0 | 5029196.0 | 5186330.0 | 22.057813 | 88.139062 | 14.396875 | 9648096.0 | 2.844018e+07 | 0.339242 |
| 5 | connecticut | 3287116.0 | 3574097.0 | 3597705.0 | 22.125000 | 89.100000 | 14.112500 | 1763152.0 | 3.919865e+06 | 0.449799 |
| 6 | delaware | 666168.0 | 897934.0 | 916868.0 | 22.833333 | 86.166667 | 15.533333 | 1138202.0 | 2.783328e+06 | 0.408936 |
| 7 | florida | 12937926.0 | 18801310.0 | 19341327.0 | 20.776119 | 82.226866 | 18.074627 | 305636.0 | 6.929057e+05 | 0.441093 |
| 8 | idaho | 1006749.0 | 1567582.0 | 1594673.0 | 26.511364 | 85.722727 | 14.915909 | 5117811.0 | 1.489513e+07 | 0.343590 |
| 9 | illinois | 11430602.0 | 12830632.0 | 12878494.0 | 22.741176 | 86.344118 | 16.320588 | 2468680.0 | 7.309304e+06 | 0.337745 |
| 10 | indiana | 5544159.0 | 6483802.0 | 6535665.0 | 24.410870 | 84.958696 | 14.528261 | 360121.0 | 1.113542e+06 | 0.323401 |
| 11 | iowa | 2776755.0 | 3046355.0 | 3074386.0 | 23.644444 | 88.998990 | 17.966667 | 447144.0 | 1.151995e+06 | 0.388148 |
| 12 | kansas | 2477574.0 | 2853118.0 | 2885316.0 | 24.143810 | 87.784762 | 18.093333 | 3696159.0 | 9.912795e+06 | 0.372867 |
| 13 | kentucky | 3685296.0 | 4339367.0 | 4383673.0 | 23.425833 | 75.736667 | 14.533333 | 1747720.0 | 4.901050e+06 | 0.356601 |
| 14 | louisiana | 3727191.0 | 4533372.0 | 4602681.0 | 24.717188 | 77.126563 | 13.215625 | 1106591.0 | 2.326061e+06 | 0.475736 |
| 15 | maine | 1227928.0 | 1328361.0 | 1328101.0 | 20.337500 | 88.981250 | 17.143750 | 835529.0 | 2.164267e+06 | 0.386056 |
| 16 | maryland | 3316186.0 | 5773552.0 | 5891680.0 | 22.825000 | 87.091667 | 14.158333 | 1354298.0 | 3.322834e+06 | 0.407573 |
| 17 | massachusetts | 3429715.0 | 6547629.0 | 6659627.0 | 20.821429 | 89.642857 | 14.800000 | 1035948.0 | 3.412850e+06 | 0.303543 |
| 18 | michigan | 9231024.0 | 9883640.0 | 9886610.0 | 21.756627 | 87.491566 | 17.484337 | 564368.0 | 1.058206e+06 | 0.533325 |
| 19 | minnesota | 4350415.0 | 5303925.0 | 5377695.0 | 23.708046 | 88.998851 | 17.018391 | 1825472.0 | 4.455739e+06 | 0.409690 |
| 20 | mississippi | 2505241.0 | 2967297.0 | 2982963.0 | 25.182927 | 75.954878 | 13.862195 | 2224255.0 | 5.184319e+06 | 0.429035 |
| 21 | missouri | 4642718.0 | 5988927.0 | 6023267.0 | 23.580870 | 82.482609 | 16.790435 | 3194901.0 | 7.733293e+06 | 0.413136 |
| 22 | montana | 751518.0 | 989415.0 | 1003522.0 | 22.301786 | 88.460714 | 18.264286 | 2090701.0 | 4.046468e+06 | 0.516673 |
| 23 | nebraska | 1578385.0 | 1826341.0 | 1854862.0 | 23.709677 | 89.673118 | 19.327957 | 788549.0 | 2.220045e+06 | 0.355195 |
| 24 | nevada | 1201833.0 | 2700551.0 | 2752410.0 | 22.952941 | 84.782353 | 15.917647 | 1920675.0 | 4.576686e+06 | 0.419665 |
| 25 | new hampshire | 303096.0 | 1316470.0 | 1320923.0 | 20.600000 | 90.030000 | 15.360000 | 360341.0 | 7.687578e+05 | 0.468732 |
| 26 | new jersey | 7730188.0 | 8791894.0 | 8882095.0 | 23.428571 | 87.428571 | 13.752381 | 485546.0 | 1.393321e+06 | 0.348481 |
| 27 | new mexico | 1515069.0 | 2059179.0 | 2083590.0 | 23.624242 | 81.636364 | 16.596970 | 702788.0 | 2.080695e+06 | 0.337766 |
| 28 | new york | 17990455.0 | 19378102.0 | 19625409.0 | 21.903226 | 86.861290 | 14.890323 | 449787.0 | 1.045277e+06 | 0.430304 |
| 29 | north carolina | 3636663.0 | 9535483.0 | 9755299.0 | 22.575000 | 80.282000 | 15.650000 | 2121584.0 | 6.732079e+06 | 0.315145 |
| 30 | north dakota | 566009.0 | 672591.0 | 701380.0 | 21.805660 | 85.988679 | 20.192453 | 596651.0 | 1.572714e+06 | 0.379377 |
| 31 | ohio | 10847115.0 | 11536504.0 | 11546969.0 | 23.939773 | 85.865909 | 14.843182 | 4753783.0 | 1.513367e+07 | 0.314120 |
| 32 | oklahoma | 3145585.0 | 3751351.0 | 3815298.0 | 24.337662 | 82.732468 | 16.038961 | 2662549.0 | 7.382848e+06 | 0.360640 |
| 33 | oregon | 2842321.0 | 3831074.0 | 3893920.0 | 21.869444 | 87.586111 | 17.688889 | 236344.0 | 5.259281e+05 | 0.449385 |
| 34 | pennsylvania | 7969652.0 | 12702379.0 | 12768034.0 | 21.217910 | 86.647761 | 16.804478 | 3825274.0 | 8.774691e+06 | 0.435944 |
| 35 | rhode island | 297188.0 | 1052567.0 | 1052761.0 | 20.580000 | 87.540000 | 15.580000 | 792980.0 | 2.838360e+06 | 0.279380 |
| 36 | south carolina | 975853.0 | 4625364.0 | 4719009.0 | 23.213043 | 79.047826 | 14.595652 | 1429356.0 | 2.993239e+06 | 0.477528 |
| 37 | south dakota | 589915.0 | 814180.0 | 832576.0 | 25.281818 | 86.231818 | 17.612121 | 3956401.0 | 1.000720e+07 | 0.395355 |
| 38 | tennessee | 4877185.0 | 6346105.0 | 6450632.0 | 22.808421 | 77.427368 | 15.713684 | 335484.0 | 8.359487e+05 | 0.401321 |
| 39 | texas | 16986510.0 | 25145561.0 | 26078327.0 | 25.064173 | 77.331890 | 15.723622 | 1340189.0 | 3.551676e+06 | 0.377340 |
| 40 | utah | 1722850.0 | 2763885.0 | 2854222.0 | 31.079310 | 89.734483 | 12.189655 | 319426.0 | 6.083405e+05 | 0.525078 |
| 41 | vermont | 562758.0 | 625741.0 | 625606.0 | 20.785714 | 89.671429 | 15.400000 | 1559129.0 | 4.898659e+06 | 0.318277 |
| 42 | virginia | 3738146.0 | 7994802.0 | 8188656.0 | 21.342105 | 80.989474 | 15.621053 | 4745545.0 | 1.884303e+07 | 0.251846 |
| 43 | washington | 4793602.0 | 6724540.0 | 6890899.0 | 22.794872 | 87.338462 | 16.187179 | 640495.0 | 1.904889e+06 | 0.336238 |
| 44 | west virginia | 1599249.0 | 1852994.0 | 1855360.0 | 20.760000 | 79.758182 | 16.810909 | 238521.0 | 4.956763e+05 | 0.481203 |
| 45 | wisconsin | 4891769.0 | 5686986.0 | 5721075.0 | 22.684722 | 88.718056 | 16.384722 | 2189841.0 | 6.288543e+06 | 0.348227 |
| 46 | wyoming | 453588.0 | 563626.0 | 576608.0 | 23.382609 | 91.147826 | 14.413043 | 2479409.0 | 5.191690e+06 | 0.477573 |
newtest12 = house.groupby(['state','year'], as_index=False).agg({'candidatevotes': 'sum'})
newtest12 = newtest12.groupby('year').get_group((2012))
newtest12
| state | year | candidatevotes | |
|---|---|---|---|
| 18 | alabama | 2012 | 1933630 |
| 41 | alaska | 2012 | 289804 |
| 64 | arizona | 2012 | 2173317 |
| 87 | arkansas | 2012 | 1038054 |
| 110 | california | 2012 | 12204357 |
| 133 | colorado | 2012 | 2450488 |
| 156 | connecticut | 2012 | 1465510 |
| 179 | delaware | 2012 | 388059 |
| 203 | florida | 2012 | 7513536 |
| 226 | georgia | 2012 | 3552967 |
| 249 | hawaii | 2012 | 437159 |
| 272 | idaho | 2012 | 635218 |
| 295 | illinois | 2012 | 5058133 |
| 318 | indiana | 2012 | 2553746 |
| 341 | iowa | 2012 | 1536849 |
| 364 | kansas | 2012 | 1057739 |
| 387 | kentucky | 2012 | 1745377 |
| 410 | louisiana | 2012 | 1705617 |
| 433 | maine | 2012 | 724623 |
| 456 | maryland | 2012 | 2585514 |
| 479 | massachusetts | 2012 | 3184196 |
| 502 | michigan | 2012 | 4574632 |
| 525 | minnesota | 2012 | 2813383 |
| 548 | mississippi | 2012 | 1208175 |
| 571 | missouri | 2012 | 2675900 |
| 594 | montana | 2012 | 479740 |
| 617 | nebraska | 2012 | 772515 |
| 640 | nevada | 2012 | 973742 |
| 663 | new hampshire | 2012 | 682416 |
| 686 | new jersey | 2012 | 3281778 |
| 709 | new mexico | 2012 | 766090 |
| 732 | new york | 2012 | 7116336 |
| 755 | north carolina | 2012 | 4384112 |
| 778 | north dakota | 2012 | 316224 |
| 801 | ohio | 2012 | 5142126 |
| 824 | oklahoma | 2012 | 1325935 |
| 847 | oregon | 2012 | 1708168 |
| 870 | pennsylvania | 2012 | 5556330 |
| 893 | rhode island | 2012 | 427775 |
| 916 | south carolina | 2012 | 1802734 |
| 939 | south dakota | 2012 | 361429 |
| 962 | tennessee | 2012 | 2283727 |
| 985 | texas | 2012 | 7664208 |
| 1008 | utah | 2012 | 998897 |
| 1031 | vermont | 2012 | 289931 |
| 1054 | virginia | 2012 | 3740455 |
| 1077 | washington | 2012 | 3006266 |
| 1100 | west virginia | 2012 | 641354 |
| 1123 | wisconsin | 2012 | 2866050 |
| 1146 | wyoming | 2012 | 250700 |
newtest12.reset_index(drop=True, inplace=True)
newtest12.rename(columns={'candidatevotes': 'candidatevotes_2012'}, inplace=True)
fullh12.rename(columns={'candidatevotes': 'candidatevotes_2010'},inplace=True)
fullh12 = pd.concat([fullh12, newtest12[['candidatevotes_2012']]], axis = 1)
fullh12
| state | pop1990 | pop2010 | pop2012 | age_under_18_2010 | hs_grad_2010 | age_over_65_2010 | candidatevotes_2010 | can_vote_2010 | voter_turnout_2010 | candidatevotes_2012 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | alabama | 4040587.0 | 4779736.0 | 4813946.0 | 23.479104 | 76.782090 | 15.119403 | 1367747.0 | 3.657497e+06 | 0.373957 | 1933630 |
| 1 | arizona | 3665228.0 | 6392017.0 | 6544211.0 | 25.306667 | 81.820000 | 16.526667 | 254335.0 | 5.291711e+05 | 0.480629 | 289804 |
| 2 | arkansas | 2350725.0 | 2915918.0 | 2949208.0 | 23.542667 | 78.864000 | 16.686667 | 1698145.0 | 4.774411e+06 | 0.355676 | 2173317 |
| 3 | california | 27030912.0 | 37253956.0 | 38019006.0 | 23.658621 | 82.458621 | 13.724138 | 774125.0 | 2.229433e+06 | 0.347230 | 1038054 |
| 4 | colorado | 3294394.0 | 5029196.0 | 5186330.0 | 22.057813 | 88.139062 | 14.396875 | 9648096.0 | 2.844018e+07 | 0.339242 | 12204357 |
| 5 | connecticut | 3287116.0 | 3574097.0 | 3597705.0 | 22.125000 | 89.100000 | 14.112500 | 1763152.0 | 3.919865e+06 | 0.449799 | 2450488 |
| 6 | delaware | 666168.0 | 897934.0 | 916868.0 | 22.833333 | 86.166667 | 15.533333 | 1138202.0 | 2.783328e+06 | 0.408936 | 1465510 |
| 7 | florida | 12937926.0 | 18801310.0 | 19341327.0 | 20.776119 | 82.226866 | 18.074627 | 305636.0 | 6.929057e+05 | 0.441093 | 388059 |
| 8 | idaho | 1006749.0 | 1567582.0 | 1594673.0 | 26.511364 | 85.722727 | 14.915909 | 5117811.0 | 1.489513e+07 | 0.343590 | 7513536 |
| 9 | illinois | 11430602.0 | 12830632.0 | 12878494.0 | 22.741176 | 86.344118 | 16.320588 | 2468680.0 | 7.309304e+06 | 0.337745 | 3552967 |
| 10 | indiana | 5544159.0 | 6483802.0 | 6535665.0 | 24.410870 | 84.958696 | 14.528261 | 360121.0 | 1.113542e+06 | 0.323401 | 437159 |
| 11 | iowa | 2776755.0 | 3046355.0 | 3074386.0 | 23.644444 | 88.998990 | 17.966667 | 447144.0 | 1.151995e+06 | 0.388148 | 635218 |
| 12 | kansas | 2477574.0 | 2853118.0 | 2885316.0 | 24.143810 | 87.784762 | 18.093333 | 3696159.0 | 9.912795e+06 | 0.372867 | 5058133 |
| 13 | kentucky | 3685296.0 | 4339367.0 | 4383673.0 | 23.425833 | 75.736667 | 14.533333 | 1747720.0 | 4.901050e+06 | 0.356601 | 2553746 |
| 14 | louisiana | 3727191.0 | 4533372.0 | 4602681.0 | 24.717188 | 77.126563 | 13.215625 | 1106591.0 | 2.326061e+06 | 0.475736 | 1536849 |
| 15 | maine | 1227928.0 | 1328361.0 | 1328101.0 | 20.337500 | 88.981250 | 17.143750 | 835529.0 | 2.164267e+06 | 0.386056 | 1057739 |
| 16 | maryland | 3316186.0 | 5773552.0 | 5891680.0 | 22.825000 | 87.091667 | 14.158333 | 1354298.0 | 3.322834e+06 | 0.407573 | 1745377 |
| 17 | massachusetts | 3429715.0 | 6547629.0 | 6659627.0 | 20.821429 | 89.642857 | 14.800000 | 1035948.0 | 3.412850e+06 | 0.303543 | 1705617 |
| 18 | michigan | 9231024.0 | 9883640.0 | 9886610.0 | 21.756627 | 87.491566 | 17.484337 | 564368.0 | 1.058206e+06 | 0.533325 | 724623 |
| 19 | minnesota | 4350415.0 | 5303925.0 | 5377695.0 | 23.708046 | 88.998851 | 17.018391 | 1825472.0 | 4.455739e+06 | 0.409690 | 2585514 |
| 20 | mississippi | 2505241.0 | 2967297.0 | 2982963.0 | 25.182927 | 75.954878 | 13.862195 | 2224255.0 | 5.184319e+06 | 0.429035 | 3184196 |
| 21 | missouri | 4642718.0 | 5988927.0 | 6023267.0 | 23.580870 | 82.482609 | 16.790435 | 3194901.0 | 7.733293e+06 | 0.413136 | 4574632 |
| 22 | montana | 751518.0 | 989415.0 | 1003522.0 | 22.301786 | 88.460714 | 18.264286 | 2090701.0 | 4.046468e+06 | 0.516673 | 2813383 |
| 23 | nebraska | 1578385.0 | 1826341.0 | 1854862.0 | 23.709677 | 89.673118 | 19.327957 | 788549.0 | 2.220045e+06 | 0.355195 | 1208175 |
| 24 | nevada | 1201833.0 | 2700551.0 | 2752410.0 | 22.952941 | 84.782353 | 15.917647 | 1920675.0 | 4.576686e+06 | 0.419665 | 2675900 |
| 25 | new hampshire | 303096.0 | 1316470.0 | 1320923.0 | 20.600000 | 90.030000 | 15.360000 | 360341.0 | 7.687578e+05 | 0.468732 | 479740 |
| 26 | new jersey | 7730188.0 | 8791894.0 | 8882095.0 | 23.428571 | 87.428571 | 13.752381 | 485546.0 | 1.393321e+06 | 0.348481 | 772515 |
| 27 | new mexico | 1515069.0 | 2059179.0 | 2083590.0 | 23.624242 | 81.636364 | 16.596970 | 702788.0 | 2.080695e+06 | 0.337766 | 973742 |
| 28 | new york | 17990455.0 | 19378102.0 | 19625409.0 | 21.903226 | 86.861290 | 14.890323 | 449787.0 | 1.045277e+06 | 0.430304 | 682416 |
| 29 | north carolina | 3636663.0 | 9535483.0 | 9755299.0 | 22.575000 | 80.282000 | 15.650000 | 2121584.0 | 6.732079e+06 | 0.315145 | 3281778 |
| 30 | north dakota | 566009.0 | 672591.0 | 701380.0 | 21.805660 | 85.988679 | 20.192453 | 596651.0 | 1.572714e+06 | 0.379377 | 766090 |
| 31 | ohio | 10847115.0 | 11536504.0 | 11546969.0 | 23.939773 | 85.865909 | 14.843182 | 4753783.0 | 1.513367e+07 | 0.314120 | 7116336 |
| 32 | oklahoma | 3145585.0 | 3751351.0 | 3815298.0 | 24.337662 | 82.732468 | 16.038961 | 2662549.0 | 7.382848e+06 | 0.360640 | 4384112 |
| 33 | oregon | 2842321.0 | 3831074.0 | 3893920.0 | 21.869444 | 87.586111 | 17.688889 | 236344.0 | 5.259281e+05 | 0.449385 | 316224 |
| 34 | pennsylvania | 7969652.0 | 12702379.0 | 12768034.0 | 21.217910 | 86.647761 | 16.804478 | 3825274.0 | 8.774691e+06 | 0.435944 | 5142126 |
| 35 | rhode island | 297188.0 | 1052567.0 | 1052761.0 | 20.580000 | 87.540000 | 15.580000 | 792980.0 | 2.838360e+06 | 0.279380 | 1325935 |
| 36 | south carolina | 975853.0 | 4625364.0 | 4719009.0 | 23.213043 | 79.047826 | 14.595652 | 1429356.0 | 2.993239e+06 | 0.477528 | 1708168 |
| 37 | south dakota | 589915.0 | 814180.0 | 832576.0 | 25.281818 | 86.231818 | 17.612121 | 3956401.0 | 1.000720e+07 | 0.395355 | 5556330 |
| 38 | tennessee | 4877185.0 | 6346105.0 | 6450632.0 | 22.808421 | 77.427368 | 15.713684 | 335484.0 | 8.359487e+05 | 0.401321 | 427775 |
| 39 | texas | 16986510.0 | 25145561.0 | 26078327.0 | 25.064173 | 77.331890 | 15.723622 | 1340189.0 | 3.551676e+06 | 0.377340 | 1802734 |
| 40 | utah | 1722850.0 | 2763885.0 | 2854222.0 | 31.079310 | 89.734483 | 12.189655 | 319426.0 | 6.083405e+05 | 0.525078 | 361429 |
| 41 | vermont | 562758.0 | 625741.0 | 625606.0 | 20.785714 | 89.671429 | 15.400000 | 1559129.0 | 4.898659e+06 | 0.318277 | 2283727 |
| 42 | virginia | 3738146.0 | 7994802.0 | 8188656.0 | 21.342105 | 80.989474 | 15.621053 | 4745545.0 | 1.884303e+07 | 0.251846 | 7664208 |
| 43 | washington | 4793602.0 | 6724540.0 | 6890899.0 | 22.794872 | 87.338462 | 16.187179 | 640495.0 | 1.904889e+06 | 0.336238 | 998897 |
| 44 | west virginia | 1599249.0 | 1852994.0 | 1855360.0 | 20.760000 | 79.758182 | 16.810909 | 238521.0 | 4.956763e+05 | 0.481203 | 289931 |
| 45 | wisconsin | 4891769.0 | 5686986.0 | 5721075.0 | 22.684722 | 88.718056 | 16.384722 | 2189841.0 | 6.288543e+06 | 0.348227 | 3740455 |
| 46 | wyoming | 453588.0 | 563626.0 | 576608.0 | 23.382609 | 91.147826 | 14.413043 | 2479409.0 | 5.191690e+06 | 0.477573 | 3006266 |
| 47 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 641354 |
| 48 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2866050 |
| 49 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 250700 |
As we can notice, the population is rising, and the life expectancy as well, so we can expect a minor increase in the number of possible voters. as hs_grad_2010 who will turn 18 and will be able to vote in 2012 are more than who die among them or among age_over_65_2010.
1 - hs_grad_2010 is the mean percentage in state among age_under_18 (which represents the mean perecentage in the state).
2- age_under_18_2010 is calculated by mean in state and represents percentage.
3- age_over_65_2010 also is the mean perecentage in a state which are of age 65 or older.
4- voters are aged 18 and older, so we calculate them by dividing the percentage by 100 and get the complementary to 1 and mutliply by pop2010. so we got number of voters in 2010.
5- candidatevotes , is the sum in the given year of all votes in a state, so the voter_turnout is candidatevotes/can_vote.
Now for 2010 we had the numbers, for 2012 we will check the ratio of hs_grad_2010/age_over_65_2010 and multiply with pop2012, and we already got the number of candidatevotes_2012.
fullh12.dropna(inplace=True)
fullh12
| state | pop1990 | pop2010 | pop2012 | age_under_18_2010 | hs_grad_2010 | age_over_65_2010 | candidatevotes_2010 | can_vote_2010 | voter_turnout_2010 | candidatevotes_2012 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | alabama | 4040587.0 | 4779736.0 | 4813946.0 | 23.479104 | 76.782090 | 15.119403 | 1367747.0 | 3.657497e+06 | 0.373957 | 1933630 |
| 1 | arizona | 3665228.0 | 6392017.0 | 6544211.0 | 25.306667 | 81.820000 | 16.526667 | 254335.0 | 5.291711e+05 | 0.480629 | 289804 |
| 2 | arkansas | 2350725.0 | 2915918.0 | 2949208.0 | 23.542667 | 78.864000 | 16.686667 | 1698145.0 | 4.774411e+06 | 0.355676 | 2173317 |
| 3 | california | 27030912.0 | 37253956.0 | 38019006.0 | 23.658621 | 82.458621 | 13.724138 | 774125.0 | 2.229433e+06 | 0.347230 | 1038054 |
| 4 | colorado | 3294394.0 | 5029196.0 | 5186330.0 | 22.057813 | 88.139062 | 14.396875 | 9648096.0 | 2.844018e+07 | 0.339242 | 12204357 |
| 5 | connecticut | 3287116.0 | 3574097.0 | 3597705.0 | 22.125000 | 89.100000 | 14.112500 | 1763152.0 | 3.919865e+06 | 0.449799 | 2450488 |
| 6 | delaware | 666168.0 | 897934.0 | 916868.0 | 22.833333 | 86.166667 | 15.533333 | 1138202.0 | 2.783328e+06 | 0.408936 | 1465510 |
| 7 | florida | 12937926.0 | 18801310.0 | 19341327.0 | 20.776119 | 82.226866 | 18.074627 | 305636.0 | 6.929057e+05 | 0.441093 | 388059 |
| 8 | idaho | 1006749.0 | 1567582.0 | 1594673.0 | 26.511364 | 85.722727 | 14.915909 | 5117811.0 | 1.489513e+07 | 0.343590 | 7513536 |
| 9 | illinois | 11430602.0 | 12830632.0 | 12878494.0 | 22.741176 | 86.344118 | 16.320588 | 2468680.0 | 7.309304e+06 | 0.337745 | 3552967 |
| 10 | indiana | 5544159.0 | 6483802.0 | 6535665.0 | 24.410870 | 84.958696 | 14.528261 | 360121.0 | 1.113542e+06 | 0.323401 | 437159 |
| 11 | iowa | 2776755.0 | 3046355.0 | 3074386.0 | 23.644444 | 88.998990 | 17.966667 | 447144.0 | 1.151995e+06 | 0.388148 | 635218 |
| 12 | kansas | 2477574.0 | 2853118.0 | 2885316.0 | 24.143810 | 87.784762 | 18.093333 | 3696159.0 | 9.912795e+06 | 0.372867 | 5058133 |
| 13 | kentucky | 3685296.0 | 4339367.0 | 4383673.0 | 23.425833 | 75.736667 | 14.533333 | 1747720.0 | 4.901050e+06 | 0.356601 | 2553746 |
| 14 | louisiana | 3727191.0 | 4533372.0 | 4602681.0 | 24.717188 | 77.126563 | 13.215625 | 1106591.0 | 2.326061e+06 | 0.475736 | 1536849 |
| 15 | maine | 1227928.0 | 1328361.0 | 1328101.0 | 20.337500 | 88.981250 | 17.143750 | 835529.0 | 2.164267e+06 | 0.386056 | 1057739 |
| 16 | maryland | 3316186.0 | 5773552.0 | 5891680.0 | 22.825000 | 87.091667 | 14.158333 | 1354298.0 | 3.322834e+06 | 0.407573 | 1745377 |
| 17 | massachusetts | 3429715.0 | 6547629.0 | 6659627.0 | 20.821429 | 89.642857 | 14.800000 | 1035948.0 | 3.412850e+06 | 0.303543 | 1705617 |
| 18 | michigan | 9231024.0 | 9883640.0 | 9886610.0 | 21.756627 | 87.491566 | 17.484337 | 564368.0 | 1.058206e+06 | 0.533325 | 724623 |
| 19 | minnesota | 4350415.0 | 5303925.0 | 5377695.0 | 23.708046 | 88.998851 | 17.018391 | 1825472.0 | 4.455739e+06 | 0.409690 | 2585514 |
| 20 | mississippi | 2505241.0 | 2967297.0 | 2982963.0 | 25.182927 | 75.954878 | 13.862195 | 2224255.0 | 5.184319e+06 | 0.429035 | 3184196 |
| 21 | missouri | 4642718.0 | 5988927.0 | 6023267.0 | 23.580870 | 82.482609 | 16.790435 | 3194901.0 | 7.733293e+06 | 0.413136 | 4574632 |
| 22 | montana | 751518.0 | 989415.0 | 1003522.0 | 22.301786 | 88.460714 | 18.264286 | 2090701.0 | 4.046468e+06 | 0.516673 | 2813383 |
| 23 | nebraska | 1578385.0 | 1826341.0 | 1854862.0 | 23.709677 | 89.673118 | 19.327957 | 788549.0 | 2.220045e+06 | 0.355195 | 1208175 |
| 24 | nevada | 1201833.0 | 2700551.0 | 2752410.0 | 22.952941 | 84.782353 | 15.917647 | 1920675.0 | 4.576686e+06 | 0.419665 | 2675900 |
| 25 | new hampshire | 303096.0 | 1316470.0 | 1320923.0 | 20.600000 | 90.030000 | 15.360000 | 360341.0 | 7.687578e+05 | 0.468732 | 479740 |
| 26 | new jersey | 7730188.0 | 8791894.0 | 8882095.0 | 23.428571 | 87.428571 | 13.752381 | 485546.0 | 1.393321e+06 | 0.348481 | 772515 |
| 27 | new mexico | 1515069.0 | 2059179.0 | 2083590.0 | 23.624242 | 81.636364 | 16.596970 | 702788.0 | 2.080695e+06 | 0.337766 | 973742 |
| 28 | new york | 17990455.0 | 19378102.0 | 19625409.0 | 21.903226 | 86.861290 | 14.890323 | 449787.0 | 1.045277e+06 | 0.430304 | 682416 |
| 29 | north carolina | 3636663.0 | 9535483.0 | 9755299.0 | 22.575000 | 80.282000 | 15.650000 | 2121584.0 | 6.732079e+06 | 0.315145 | 3281778 |
| 30 | north dakota | 566009.0 | 672591.0 | 701380.0 | 21.805660 | 85.988679 | 20.192453 | 596651.0 | 1.572714e+06 | 0.379377 | 766090 |
| 31 | ohio | 10847115.0 | 11536504.0 | 11546969.0 | 23.939773 | 85.865909 | 14.843182 | 4753783.0 | 1.513367e+07 | 0.314120 | 7116336 |
| 32 | oklahoma | 3145585.0 | 3751351.0 | 3815298.0 | 24.337662 | 82.732468 | 16.038961 | 2662549.0 | 7.382848e+06 | 0.360640 | 4384112 |
| 33 | oregon | 2842321.0 | 3831074.0 | 3893920.0 | 21.869444 | 87.586111 | 17.688889 | 236344.0 | 5.259281e+05 | 0.449385 | 316224 |
| 34 | pennsylvania | 7969652.0 | 12702379.0 | 12768034.0 | 21.217910 | 86.647761 | 16.804478 | 3825274.0 | 8.774691e+06 | 0.435944 | 5142126 |
| 35 | rhode island | 297188.0 | 1052567.0 | 1052761.0 | 20.580000 | 87.540000 | 15.580000 | 792980.0 | 2.838360e+06 | 0.279380 | 1325935 |
| 36 | south carolina | 975853.0 | 4625364.0 | 4719009.0 | 23.213043 | 79.047826 | 14.595652 | 1429356.0 | 2.993239e+06 | 0.477528 | 1708168 |
| 37 | south dakota | 589915.0 | 814180.0 | 832576.0 | 25.281818 | 86.231818 | 17.612121 | 3956401.0 | 1.000720e+07 | 0.395355 | 5556330 |
| 38 | tennessee | 4877185.0 | 6346105.0 | 6450632.0 | 22.808421 | 77.427368 | 15.713684 | 335484.0 | 8.359487e+05 | 0.401321 | 427775 |
| 39 | texas | 16986510.0 | 25145561.0 | 26078327.0 | 25.064173 | 77.331890 | 15.723622 | 1340189.0 | 3.551676e+06 | 0.377340 | 1802734 |
| 40 | utah | 1722850.0 | 2763885.0 | 2854222.0 | 31.079310 | 89.734483 | 12.189655 | 319426.0 | 6.083405e+05 | 0.525078 | 361429 |
| 41 | vermont | 562758.0 | 625741.0 | 625606.0 | 20.785714 | 89.671429 | 15.400000 | 1559129.0 | 4.898659e+06 | 0.318277 | 2283727 |
| 42 | virginia | 3738146.0 | 7994802.0 | 8188656.0 | 21.342105 | 80.989474 | 15.621053 | 4745545.0 | 1.884303e+07 | 0.251846 | 7664208 |
| 43 | washington | 4793602.0 | 6724540.0 | 6890899.0 | 22.794872 | 87.338462 | 16.187179 | 640495.0 | 1.904889e+06 | 0.336238 | 998897 |
| 44 | west virginia | 1599249.0 | 1852994.0 | 1855360.0 | 20.760000 | 79.758182 | 16.810909 | 238521.0 | 4.956763e+05 | 0.481203 | 289931 |
| 45 | wisconsin | 4891769.0 | 5686986.0 | 5721075.0 | 22.684722 | 88.718056 | 16.384722 | 2189841.0 | 6.288543e+06 | 0.348227 | 3740455 |
| 46 | wyoming | 453588.0 | 563626.0 | 576608.0 | 23.382609 | 91.147826 | 14.413043 | 2479409.0 | 5.191690e+06 | 0.477573 | 3006266 |
fullh12['multi_ratio'] = (fullh12['hs_grad_2010']/100 * fullh12['age_under_18_2010']/100) / fullh12['age_over_65_2010'] + 1
fullh12
| state | pop1990 | pop2010 | pop2012 | age_under_18_2010 | hs_grad_2010 | age_over_65_2010 | candidatevotes_2010 | can_vote_2010 | voter_turnout_2010 | candidatevotes_2012 | multi_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | alabama | 4040587.0 | 4779736.0 | 4813946.0 | 23.479104 | 76.782090 | 15.119403 | 1367747.0 | 3.657497e+06 | 0.373957 | 1933630 | 1.011924 |
| 1 | arizona | 3665228.0 | 6392017.0 | 6544211.0 | 25.306667 | 81.820000 | 16.526667 | 254335.0 | 5.291711e+05 | 0.480629 | 289804 | 1.012529 |
| 2 | arkansas | 2350725.0 | 2915918.0 | 2949208.0 | 23.542667 | 78.864000 | 16.686667 | 1698145.0 | 4.774411e+06 | 0.355676 | 2173317 | 1.011127 |
| 3 | california | 27030912.0 | 37253956.0 | 38019006.0 | 23.658621 | 82.458621 | 13.724138 | 774125.0 | 2.229433e+06 | 0.347230 | 1038054 | 1.014215 |
| 4 | colorado | 3294394.0 | 5029196.0 | 5186330.0 | 22.057813 | 88.139062 | 14.396875 | 9648096.0 | 2.844018e+07 | 0.339242 | 12204357 | 1.013504 |
| 5 | connecticut | 3287116.0 | 3574097.0 | 3597705.0 | 22.125000 | 89.100000 | 14.112500 | 1763152.0 | 3.919865e+06 | 0.449799 | 2450488 | 1.013969 |
| 6 | delaware | 666168.0 | 897934.0 | 916868.0 | 22.833333 | 86.166667 | 15.533333 | 1138202.0 | 2.783328e+06 | 0.408936 | 1465510 | 1.012666 |
| 7 | florida | 12937926.0 | 18801310.0 | 19341327.0 | 20.776119 | 82.226866 | 18.074627 | 305636.0 | 6.929057e+05 | 0.441093 | 388059 | 1.009452 |
| 8 | idaho | 1006749.0 | 1567582.0 | 1594673.0 | 26.511364 | 85.722727 | 14.915909 | 5117811.0 | 1.489513e+07 | 0.343590 | 7513536 | 1.015236 |
| 9 | illinois | 11430602.0 | 12830632.0 | 12878494.0 | 22.741176 | 86.344118 | 16.320588 | 2468680.0 | 7.309304e+06 | 0.337745 | 3552967 | 1.012031 |
| 10 | indiana | 5544159.0 | 6483802.0 | 6535665.0 | 24.410870 | 84.958696 | 14.528261 | 360121.0 | 1.113542e+06 | 0.323401 | 437159 | 1.014275 |
| 11 | iowa | 2776755.0 | 3046355.0 | 3074386.0 | 23.644444 | 88.998990 | 17.966667 | 447144.0 | 1.151995e+06 | 0.388148 | 635218 | 1.011712 |
| 12 | kansas | 2477574.0 | 2853118.0 | 2885316.0 | 24.143810 | 87.784762 | 18.093333 | 3696159.0 | 9.912795e+06 | 0.372867 | 5058133 | 1.011714 |
| 13 | kentucky | 3685296.0 | 4339367.0 | 4383673.0 | 23.425833 | 75.736667 | 14.533333 | 1747720.0 | 4.901050e+06 | 0.356601 | 2553746 | 1.012208 |
| 14 | louisiana | 3727191.0 | 4533372.0 | 4602681.0 | 24.717188 | 77.126563 | 13.215625 | 1106591.0 | 2.326061e+06 | 0.475736 | 1536849 | 1.014425 |
| 15 | maine | 1227928.0 | 1328361.0 | 1328101.0 | 20.337500 | 88.981250 | 17.143750 | 835529.0 | 2.164267e+06 | 0.386056 | 1057739 | 1.010556 |
| 16 | maryland | 3316186.0 | 5773552.0 | 5891680.0 | 22.825000 | 87.091667 | 14.158333 | 1354298.0 | 3.322834e+06 | 0.407573 | 1745377 | 1.014040 |
| 17 | massachusetts | 3429715.0 | 6547629.0 | 6659627.0 | 20.821429 | 89.642857 | 14.800000 | 1035948.0 | 3.412850e+06 | 0.303543 | 1705617 | 1.012611 |
| 18 | michigan | 9231024.0 | 9883640.0 | 9886610.0 | 21.756627 | 87.491566 | 17.484337 | 564368.0 | 1.058206e+06 | 0.533325 | 724623 | 1.010887 |
| 19 | minnesota | 4350415.0 | 5303925.0 | 5377695.0 | 23.708046 | 88.998851 | 17.018391 | 1825472.0 | 4.455739e+06 | 0.409690 | 2585514 | 1.012398 |
| 20 | mississippi | 2505241.0 | 2967297.0 | 2982963.0 | 25.182927 | 75.954878 | 13.862195 | 2224255.0 | 5.184319e+06 | 0.429035 | 3184196 | 1.013798 |
| 21 | missouri | 4642718.0 | 5988927.0 | 6023267.0 | 23.580870 | 82.482609 | 16.790435 | 3194901.0 | 7.733293e+06 | 0.413136 | 4574632 | 1.011584 |
| 22 | montana | 751518.0 | 989415.0 | 1003522.0 | 22.301786 | 88.460714 | 18.264286 | 2090701.0 | 4.046468e+06 | 0.516673 | 2813383 | 1.010802 |
| 23 | nebraska | 1578385.0 | 1826341.0 | 1854862.0 | 23.709677 | 89.673118 | 19.327957 | 788549.0 | 2.220045e+06 | 0.355195 | 1208175 | 1.011000 |
| 24 | nevada | 1201833.0 | 2700551.0 | 2752410.0 | 22.952941 | 84.782353 | 15.917647 | 1920675.0 | 4.576686e+06 | 0.419665 | 2675900 | 1.012225 |
| 25 | new hampshire | 303096.0 | 1316470.0 | 1320923.0 | 20.600000 | 90.030000 | 15.360000 | 360341.0 | 7.687578e+05 | 0.468732 | 479740 | 1.012074 |
| 26 | new jersey | 7730188.0 | 8791894.0 | 8882095.0 | 23.428571 | 87.428571 | 13.752381 | 485546.0 | 1.393321e+06 | 0.348481 | 772515 | 1.014894 |
| 27 | new mexico | 1515069.0 | 2059179.0 | 2083590.0 | 23.624242 | 81.636364 | 16.596970 | 702788.0 | 2.080695e+06 | 0.337766 | 973742 | 1.011620 |
| 28 | new york | 17990455.0 | 19378102.0 | 19625409.0 | 21.903226 | 86.861290 | 14.890323 | 449787.0 | 1.045277e+06 | 0.430304 | 682416 | 1.012777 |
| 29 | north carolina | 3636663.0 | 9535483.0 | 9755299.0 | 22.575000 | 80.282000 | 15.650000 | 2121584.0 | 6.732079e+06 | 0.315145 | 3281778 | 1.011581 |
| 30 | north dakota | 566009.0 | 672591.0 | 701380.0 | 21.805660 | 85.988679 | 20.192453 | 596651.0 | 1.572714e+06 | 0.379377 | 766090 | 1.009286 |
| 31 | ohio | 10847115.0 | 11536504.0 | 11546969.0 | 23.939773 | 85.865909 | 14.843182 | 4753783.0 | 1.513367e+07 | 0.314120 | 7116336 | 1.013849 |
| 32 | oklahoma | 3145585.0 | 3751351.0 | 3815298.0 | 24.337662 | 82.732468 | 16.038961 | 2662549.0 | 7.382848e+06 | 0.360640 | 4384112 | 1.012554 |
| 33 | oregon | 2842321.0 | 3831074.0 | 3893920.0 | 21.869444 | 87.586111 | 17.688889 | 236344.0 | 5.259281e+05 | 0.449385 | 316224 | 1.010829 |
| 34 | pennsylvania | 7969652.0 | 12702379.0 | 12768034.0 | 21.217910 | 86.647761 | 16.804478 | 3825274.0 | 8.774691e+06 | 0.435944 | 5142126 | 1.010940 |
| 35 | rhode island | 297188.0 | 1052567.0 | 1052761.0 | 20.580000 | 87.540000 | 15.580000 | 792980.0 | 2.838360e+06 | 0.279380 | 1325935 | 1.011563 |
| 36 | south carolina | 975853.0 | 4625364.0 | 4719009.0 | 23.213043 | 79.047826 | 14.595652 | 1429356.0 | 2.993239e+06 | 0.477528 | 1708168 | 1.012572 |
| 37 | south dakota | 589915.0 | 814180.0 | 832576.0 | 25.281818 | 86.231818 | 17.612121 | 3956401.0 | 1.000720e+07 | 0.395355 | 5556330 | 1.012378 |
| 38 | tennessee | 4877185.0 | 6346105.0 | 6450632.0 | 22.808421 | 77.427368 | 15.713684 | 335484.0 | 8.359487e+05 | 0.401321 | 427775 | 1.011239 |
| 39 | texas | 16986510.0 | 25145561.0 | 26078327.0 | 25.064173 | 77.331890 | 15.723622 | 1340189.0 | 3.551676e+06 | 0.377340 | 1802734 | 1.012327 |
| 40 | utah | 1722850.0 | 2763885.0 | 2854222.0 | 31.079310 | 89.734483 | 12.189655 | 319426.0 | 6.083405e+05 | 0.525078 | 361429 | 1.022879 |
| 41 | vermont | 562758.0 | 625741.0 | 625606.0 | 20.785714 | 89.671429 | 15.400000 | 1559129.0 | 4.898659e+06 | 0.318277 | 2283727 | 1.012103 |
| 42 | virginia | 3738146.0 | 7994802.0 | 8188656.0 | 21.342105 | 80.989474 | 15.621053 | 4745545.0 | 1.884303e+07 | 0.251846 | 7664208 | 1.011065 |
| 43 | washington | 4793602.0 | 6724540.0 | 6890899.0 | 22.794872 | 87.338462 | 16.187179 | 640495.0 | 1.904889e+06 | 0.336238 | 998897 | 1.012299 |
| 44 | west virginia | 1599249.0 | 1852994.0 | 1855360.0 | 20.760000 | 79.758182 | 16.810909 | 238521.0 | 4.956763e+05 | 0.481203 | 289931 | 1.009849 |
| 45 | wisconsin | 4891769.0 | 5686986.0 | 5721075.0 | 22.684722 | 88.718056 | 16.384722 | 2189841.0 | 6.288543e+06 | 0.348227 | 3740455 | 1.012283 |
| 46 | wyoming | 453588.0 | 563626.0 | 576608.0 | 23.382609 | 91.147826 | 14.413043 | 2479409.0 | 5.191690e+06 | 0.477573 | 3006266 | 1.014787 |
fullh12['can_vote_2012'] = fullh12['multi_ratio'] * fullh12['can_vote_2010']
fullh12['voter_turnout_2012'] = fullh12['candidatevotes_2012'] / fullh12['can_vote_2012']
fullh12
| state | pop1990 | pop2010 | pop2012 | age_under_18_2010 | hs_grad_2010 | age_over_65_2010 | candidatevotes_2010 | can_vote_2010 | voter_turnout_2010 | candidatevotes_2012 | multi_ratio | can_vote_2012 | voter_turnout_2012 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | alabama | 4040587.0 | 4779736.0 | 4813946.0 | 23.479104 | 76.782090 | 15.119403 | 1367747.0 | 3.657497e+06 | 0.373957 | 1933630 | 1.011924 | 3.701107e+06 | 0.522446 |
| 1 | arizona | 3665228.0 | 6392017.0 | 6544211.0 | 25.306667 | 81.820000 | 16.526667 | 254335.0 | 5.291711e+05 | 0.480629 | 289804 | 1.012529 | 5.358010e+05 | 0.540880 |
| 2 | arkansas | 2350725.0 | 2915918.0 | 2949208.0 | 23.542667 | 78.864000 | 16.686667 | 1698145.0 | 4.774411e+06 | 0.355676 | 2173317 | 1.011127 | 4.827534e+06 | 0.450192 |
| 3 | california | 27030912.0 | 37253956.0 | 38019006.0 | 23.658621 | 82.458621 | 13.724138 | 774125.0 | 2.229433e+06 | 0.347230 | 1038054 | 1.014215 | 2.261124e+06 | 0.459088 |
| 4 | colorado | 3294394.0 | 5029196.0 | 5186330.0 | 22.057813 | 88.139062 | 14.396875 | 9648096.0 | 2.844018e+07 | 0.339242 | 12204357 | 1.013504 | 2.882424e+07 | 0.423406 |
| 5 | connecticut | 3287116.0 | 3574097.0 | 3597705.0 | 22.125000 | 89.100000 | 14.112500 | 1763152.0 | 3.919865e+06 | 0.449799 | 2450488 | 1.013969 | 3.974621e+06 | 0.616534 |
| 6 | delaware | 666168.0 | 897934.0 | 916868.0 | 22.833333 | 86.166667 | 15.533333 | 1138202.0 | 2.783328e+06 | 0.408936 | 1465510 | 1.012666 | 2.818582e+06 | 0.519946 |
| 7 | florida | 12937926.0 | 18801310.0 | 19341327.0 | 20.776119 | 82.226866 | 18.074627 | 305636.0 | 6.929057e+05 | 0.441093 | 388059 | 1.009452 | 6.994549e+05 | 0.554802 |
| 8 | idaho | 1006749.0 | 1567582.0 | 1594673.0 | 26.511364 | 85.722727 | 14.915909 | 5117811.0 | 1.489513e+07 | 0.343590 | 7513536 | 1.015236 | 1.512207e+07 | 0.496859 |
| 9 | illinois | 11430602.0 | 12830632.0 | 12878494.0 | 22.741176 | 86.344118 | 16.320588 | 2468680.0 | 7.309304e+06 | 0.337745 | 3552967 | 1.012031 | 7.397244e+06 | 0.480310 |
| 10 | indiana | 5544159.0 | 6483802.0 | 6535665.0 | 24.410870 | 84.958696 | 14.528261 | 360121.0 | 1.113542e+06 | 0.323401 | 437159 | 1.014275 | 1.129438e+06 | 0.387059 |
| 11 | iowa | 2776755.0 | 3046355.0 | 3074386.0 | 23.644444 | 88.998990 | 17.966667 | 447144.0 | 1.151995e+06 | 0.388148 | 635218 | 1.011712 | 1.165487e+06 | 0.545024 |
| 12 | kansas | 2477574.0 | 2853118.0 | 2885316.0 | 24.143810 | 87.784762 | 18.093333 | 3696159.0 | 9.912795e+06 | 0.372867 | 5058133 | 1.011714 | 1.002891e+07 | 0.504355 |
| 13 | kentucky | 3685296.0 | 4339367.0 | 4383673.0 | 23.425833 | 75.736667 | 14.533333 | 1747720.0 | 4.901050e+06 | 0.356601 | 2553746 | 1.012208 | 4.960880e+06 | 0.514777 |
| 14 | louisiana | 3727191.0 | 4533372.0 | 4602681.0 | 24.717188 | 77.126563 | 13.215625 | 1106591.0 | 2.326061e+06 | 0.475736 | 1536849 | 1.014425 | 2.359615e+06 | 0.651314 |
| 15 | maine | 1227928.0 | 1328361.0 | 1328101.0 | 20.337500 | 88.981250 | 17.143750 | 835529.0 | 2.164267e+06 | 0.386056 | 1057739 | 1.010556 | 2.187112e+06 | 0.483624 |
| 16 | maryland | 3316186.0 | 5773552.0 | 5891680.0 | 22.825000 | 87.091667 | 14.158333 | 1354298.0 | 3.322834e+06 | 0.407573 | 1745377 | 1.014040 | 3.369488e+06 | 0.517995 |
| 17 | massachusetts | 3429715.0 | 6547629.0 | 6659627.0 | 20.821429 | 89.642857 | 14.800000 | 1035948.0 | 3.412850e+06 | 0.303543 | 1705617 | 1.012611 | 3.455891e+06 | 0.493539 |
| 18 | michigan | 9231024.0 | 9883640.0 | 9886610.0 | 21.756627 | 87.491566 | 17.484337 | 564368.0 | 1.058206e+06 | 0.533325 | 724623 | 1.010887 | 1.069726e+06 | 0.677391 |
| 19 | minnesota | 4350415.0 | 5303925.0 | 5377695.0 | 23.708046 | 88.998851 | 17.018391 | 1825472.0 | 4.455739e+06 | 0.409690 | 2585514 | 1.012398 | 4.510982e+06 | 0.573160 |
| 20 | mississippi | 2505241.0 | 2967297.0 | 2982963.0 | 25.182927 | 75.954878 | 13.862195 | 2224255.0 | 5.184319e+06 | 0.429035 | 3184196 | 1.013798 | 5.255855e+06 | 0.605838 |
| 21 | missouri | 4642718.0 | 5988927.0 | 6023267.0 | 23.580870 | 82.482609 | 16.790435 | 3194901.0 | 7.733293e+06 | 0.413136 | 4574632 | 1.011584 | 7.822876e+06 | 0.584776 |
| 22 | montana | 751518.0 | 989415.0 | 1003522.0 | 22.301786 | 88.460714 | 18.264286 | 2090701.0 | 4.046468e+06 | 0.516673 | 2813383 | 1.010802 | 4.090176e+06 | 0.687839 |
| 23 | nebraska | 1578385.0 | 1826341.0 | 1854862.0 | 23.709677 | 89.673118 | 19.327957 | 788549.0 | 2.220045e+06 | 0.355195 | 1208175 | 1.011000 | 2.244466e+06 | 0.538291 |
| 24 | nevada | 1201833.0 | 2700551.0 | 2752410.0 | 22.952941 | 84.782353 | 15.917647 | 1920675.0 | 4.576686e+06 | 0.419665 | 2675900 | 1.012225 | 4.632638e+06 | 0.577619 |
| 25 | new hampshire | 303096.0 | 1316470.0 | 1320923.0 | 20.600000 | 90.030000 | 15.360000 | 360341.0 | 7.687578e+05 | 0.468732 | 479740 | 1.012074 | 7.780400e+05 | 0.616601 |
| 26 | new jersey | 7730188.0 | 8791894.0 | 8882095.0 | 23.428571 | 87.428571 | 13.752381 | 485546.0 | 1.393321e+06 | 0.348481 | 772515 | 1.014894 | 1.414074e+06 | 0.546304 |
| 27 | new mexico | 1515069.0 | 2059179.0 | 2083590.0 | 23.624242 | 81.636364 | 16.596970 | 702788.0 | 2.080695e+06 | 0.337766 | 973742 | 1.011620 | 2.104873e+06 | 0.462613 |
| 28 | new york | 17990455.0 | 19378102.0 | 19625409.0 | 21.903226 | 86.861290 | 14.890323 | 449787.0 | 1.045277e+06 | 0.430304 | 682416 | 1.012777 | 1.058633e+06 | 0.644620 |
| 29 | north carolina | 3636663.0 | 9535483.0 | 9755299.0 | 22.575000 | 80.282000 | 15.650000 | 2121584.0 | 6.732079e+06 | 0.315145 | 3281778 | 1.011581 | 6.810040e+06 | 0.481903 |
| 30 | north dakota | 566009.0 | 672591.0 | 701380.0 | 21.805660 | 85.988679 | 20.192453 | 596651.0 | 1.572714e+06 | 0.379377 | 766090 | 1.009286 | 1.587318e+06 | 0.482632 |
| 31 | ohio | 10847115.0 | 11536504.0 | 11546969.0 | 23.939773 | 85.865909 | 14.843182 | 4753783.0 | 1.513367e+07 | 0.314120 | 7116336 | 1.013849 | 1.534326e+07 | 0.463809 |
| 32 | oklahoma | 3145585.0 | 3751351.0 | 3815298.0 | 24.337662 | 82.732468 | 16.038961 | 2662549.0 | 7.382848e+06 | 0.360640 | 4384112 | 1.012554 | 7.475531e+06 | 0.586462 |
| 33 | oregon | 2842321.0 | 3831074.0 | 3893920.0 | 21.869444 | 87.586111 | 17.688889 | 236344.0 | 5.259281e+05 | 0.449385 | 316224 | 1.010829 | 5.316232e+05 | 0.594827 |
| 34 | pennsylvania | 7969652.0 | 12702379.0 | 12768034.0 | 21.217910 | 86.647761 | 16.804478 | 3825274.0 | 8.774691e+06 | 0.435944 | 5142126 | 1.010940 | 8.870690e+06 | 0.579676 |
| 35 | rhode island | 297188.0 | 1052567.0 | 1052761.0 | 20.580000 | 87.540000 | 15.580000 | 792980.0 | 2.838360e+06 | 0.279380 | 1325935 | 1.011563 | 2.871181e+06 | 0.461808 |
| 36 | south carolina | 975853.0 | 4625364.0 | 4719009.0 | 23.213043 | 79.047826 | 14.595652 | 1429356.0 | 2.993239e+06 | 0.477528 | 1708168 | 1.012572 | 3.030870e+06 | 0.563590 |
| 37 | south dakota | 589915.0 | 814180.0 | 832576.0 | 25.281818 | 86.231818 | 17.612121 | 3956401.0 | 1.000720e+07 | 0.395355 | 5556330 | 1.012378 | 1.013107e+07 | 0.548444 |
| 38 | tennessee | 4877185.0 | 6346105.0 | 6450632.0 | 22.808421 | 77.427368 | 15.713684 | 335484.0 | 8.359487e+05 | 0.401321 | 427775 | 1.011239 | 8.453436e+05 | 0.506037 |
| 39 | texas | 16986510.0 | 25145561.0 | 26078327.0 | 25.064173 | 77.331890 | 15.723622 | 1340189.0 | 3.551676e+06 | 0.377340 | 1802734 | 1.012327 | 3.595458e+06 | 0.501392 |
| 40 | utah | 1722850.0 | 2763885.0 | 2854222.0 | 31.079310 | 89.734483 | 12.189655 | 319426.0 | 6.083405e+05 | 0.525078 | 361429 | 1.022879 | 6.222588e+05 | 0.580834 |
| 41 | vermont | 562758.0 | 625741.0 | 625606.0 | 20.785714 | 89.671429 | 15.400000 | 1559129.0 | 4.898659e+06 | 0.318277 | 2283727 | 1.012103 | 4.957948e+06 | 0.460619 |
| 42 | virginia | 3738146.0 | 7994802.0 | 8188656.0 | 21.342105 | 80.989474 | 15.621053 | 4745545.0 | 1.884303e+07 | 0.251846 | 7664208 | 1.011065 | 1.905153e+07 | 0.402288 |
| 43 | washington | 4793602.0 | 6724540.0 | 6890899.0 | 22.794872 | 87.338462 | 16.187179 | 640495.0 | 1.904889e+06 | 0.336238 | 998897 | 1.012299 | 1.928317e+06 | 0.518015 |
| 44 | west virginia | 1599249.0 | 1852994.0 | 1855360.0 | 20.760000 | 79.758182 | 16.810909 | 238521.0 | 4.956763e+05 | 0.481203 | 289931 | 1.009849 | 5.005584e+05 | 0.579215 |
| 45 | wisconsin | 4891769.0 | 5686986.0 | 5721075.0 | 22.684722 | 88.718056 | 16.384722 | 2189841.0 | 6.288543e+06 | 0.348227 | 3740455 | 1.012283 | 6.365785e+06 | 0.587587 |
| 46 | wyoming | 453588.0 | 563626.0 | 576608.0 | 23.382609 | 91.147826 | 14.413043 | 2479409.0 | 5.191690e+06 | 0.477573 | 3006266 | 1.014787 | 5.268460e+06 | 0.570616 |
fig = plt.figure(figsize = (12,12))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('State', fontsize = 15)
ax.set_ylabel('Voter Turnout 2012', fontsize = 15)
ax.set_title('State Vs. Voter Turn out 2012', fontsize = 20, color='cornflowerblue')
targets = ['california', 'florida', 'south dakota', 'wyoming']
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
indicesToKeep = fullh12['state'] == target
ax.scatter(fullh12.loc[indicesToKeep, 'state']
, fullh12.loc[indicesToKeep, 'voter_turnout_2012']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
fig = plt.figure(figsize = (12,12))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('State', fontsize = 15)
ax.set_ylabel('Voter Turnout 2010', fontsize = 15)
ax.set_title('State Vs. Voter Turn out 2010', fontsize = 20, color='cornflowerblue')
targets = ['california', 'florida', 'south dakota', 'wyoming']
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
indicesToKeep = fullh12['state'] == target
ax.scatter(fullh12.loc[indicesToKeep, 'state']
, fullh12.loc[indicesToKeep, 'voter_turnout_2010']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
nfullh12 = fullh12.copy()
nfullh12['state'] = pd.factorize(nfullh12['state'])[0]
nfullh12
| state | pop1990 | pop2010 | pop2012 | age_under_18_2010 | hs_grad_2010 | age_over_65_2010 | candidatevotes_2010 | can_vote_2010 | voter_turnout_2010 | candidatevotes_2012 | multi_ratio | can_vote_2012 | voter_turnout_2012 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 4040587.0 | 4779736.0 | 4813946.0 | 23.479104 | 76.782090 | 15.119403 | 1367747.0 | 3.657497e+06 | 0.373957 | 1933630 | 1.011924 | 3.701107e+06 | 0.522446 |
| 1 | 1 | 3665228.0 | 6392017.0 | 6544211.0 | 25.306667 | 81.820000 | 16.526667 | 254335.0 | 5.291711e+05 | 0.480629 | 289804 | 1.012529 | 5.358010e+05 | 0.540880 |
| 2 | 2 | 2350725.0 | 2915918.0 | 2949208.0 | 23.542667 | 78.864000 | 16.686667 | 1698145.0 | 4.774411e+06 | 0.355676 | 2173317 | 1.011127 | 4.827534e+06 | 0.450192 |
| 3 | 3 | 27030912.0 | 37253956.0 | 38019006.0 | 23.658621 | 82.458621 | 13.724138 | 774125.0 | 2.229433e+06 | 0.347230 | 1038054 | 1.014215 | 2.261124e+06 | 0.459088 |
| 4 | 4 | 3294394.0 | 5029196.0 | 5186330.0 | 22.057813 | 88.139062 | 14.396875 | 9648096.0 | 2.844018e+07 | 0.339242 | 12204357 | 1.013504 | 2.882424e+07 | 0.423406 |
| 5 | 5 | 3287116.0 | 3574097.0 | 3597705.0 | 22.125000 | 89.100000 | 14.112500 | 1763152.0 | 3.919865e+06 | 0.449799 | 2450488 | 1.013969 | 3.974621e+06 | 0.616534 |
| 6 | 6 | 666168.0 | 897934.0 | 916868.0 | 22.833333 | 86.166667 | 15.533333 | 1138202.0 | 2.783328e+06 | 0.408936 | 1465510 | 1.012666 | 2.818582e+06 | 0.519946 |
| 7 | 7 | 12937926.0 | 18801310.0 | 19341327.0 | 20.776119 | 82.226866 | 18.074627 | 305636.0 | 6.929057e+05 | 0.441093 | 388059 | 1.009452 | 6.994549e+05 | 0.554802 |
| 8 | 8 | 1006749.0 | 1567582.0 | 1594673.0 | 26.511364 | 85.722727 | 14.915909 | 5117811.0 | 1.489513e+07 | 0.343590 | 7513536 | 1.015236 | 1.512207e+07 | 0.496859 |
| 9 | 9 | 11430602.0 | 12830632.0 | 12878494.0 | 22.741176 | 86.344118 | 16.320588 | 2468680.0 | 7.309304e+06 | 0.337745 | 3552967 | 1.012031 | 7.397244e+06 | 0.480310 |
| 10 | 10 | 5544159.0 | 6483802.0 | 6535665.0 | 24.410870 | 84.958696 | 14.528261 | 360121.0 | 1.113542e+06 | 0.323401 | 437159 | 1.014275 | 1.129438e+06 | 0.387059 |
| 11 | 11 | 2776755.0 | 3046355.0 | 3074386.0 | 23.644444 | 88.998990 | 17.966667 | 447144.0 | 1.151995e+06 | 0.388148 | 635218 | 1.011712 | 1.165487e+06 | 0.545024 |
| 12 | 12 | 2477574.0 | 2853118.0 | 2885316.0 | 24.143810 | 87.784762 | 18.093333 | 3696159.0 | 9.912795e+06 | 0.372867 | 5058133 | 1.011714 | 1.002891e+07 | 0.504355 |
| 13 | 13 | 3685296.0 | 4339367.0 | 4383673.0 | 23.425833 | 75.736667 | 14.533333 | 1747720.0 | 4.901050e+06 | 0.356601 | 2553746 | 1.012208 | 4.960880e+06 | 0.514777 |
| 14 | 14 | 3727191.0 | 4533372.0 | 4602681.0 | 24.717188 | 77.126563 | 13.215625 | 1106591.0 | 2.326061e+06 | 0.475736 | 1536849 | 1.014425 | 2.359615e+06 | 0.651314 |
| 15 | 15 | 1227928.0 | 1328361.0 | 1328101.0 | 20.337500 | 88.981250 | 17.143750 | 835529.0 | 2.164267e+06 | 0.386056 | 1057739 | 1.010556 | 2.187112e+06 | 0.483624 |
| 16 | 16 | 3316186.0 | 5773552.0 | 5891680.0 | 22.825000 | 87.091667 | 14.158333 | 1354298.0 | 3.322834e+06 | 0.407573 | 1745377 | 1.014040 | 3.369488e+06 | 0.517995 |
| 17 | 17 | 3429715.0 | 6547629.0 | 6659627.0 | 20.821429 | 89.642857 | 14.800000 | 1035948.0 | 3.412850e+06 | 0.303543 | 1705617 | 1.012611 | 3.455891e+06 | 0.493539 |
| 18 | 18 | 9231024.0 | 9883640.0 | 9886610.0 | 21.756627 | 87.491566 | 17.484337 | 564368.0 | 1.058206e+06 | 0.533325 | 724623 | 1.010887 | 1.069726e+06 | 0.677391 |
| 19 | 19 | 4350415.0 | 5303925.0 | 5377695.0 | 23.708046 | 88.998851 | 17.018391 | 1825472.0 | 4.455739e+06 | 0.409690 | 2585514 | 1.012398 | 4.510982e+06 | 0.573160 |
| 20 | 20 | 2505241.0 | 2967297.0 | 2982963.0 | 25.182927 | 75.954878 | 13.862195 | 2224255.0 | 5.184319e+06 | 0.429035 | 3184196 | 1.013798 | 5.255855e+06 | 0.605838 |
| 21 | 21 | 4642718.0 | 5988927.0 | 6023267.0 | 23.580870 | 82.482609 | 16.790435 | 3194901.0 | 7.733293e+06 | 0.413136 | 4574632 | 1.011584 | 7.822876e+06 | 0.584776 |
| 22 | 22 | 751518.0 | 989415.0 | 1003522.0 | 22.301786 | 88.460714 | 18.264286 | 2090701.0 | 4.046468e+06 | 0.516673 | 2813383 | 1.010802 | 4.090176e+06 | 0.687839 |
| 23 | 23 | 1578385.0 | 1826341.0 | 1854862.0 | 23.709677 | 89.673118 | 19.327957 | 788549.0 | 2.220045e+06 | 0.355195 | 1208175 | 1.011000 | 2.244466e+06 | 0.538291 |
| 24 | 24 | 1201833.0 | 2700551.0 | 2752410.0 | 22.952941 | 84.782353 | 15.917647 | 1920675.0 | 4.576686e+06 | 0.419665 | 2675900 | 1.012225 | 4.632638e+06 | 0.577619 |
| 25 | 25 | 303096.0 | 1316470.0 | 1320923.0 | 20.600000 | 90.030000 | 15.360000 | 360341.0 | 7.687578e+05 | 0.468732 | 479740 | 1.012074 | 7.780400e+05 | 0.616601 |
| 26 | 26 | 7730188.0 | 8791894.0 | 8882095.0 | 23.428571 | 87.428571 | 13.752381 | 485546.0 | 1.393321e+06 | 0.348481 | 772515 | 1.014894 | 1.414074e+06 | 0.546304 |
| 27 | 27 | 1515069.0 | 2059179.0 | 2083590.0 | 23.624242 | 81.636364 | 16.596970 | 702788.0 | 2.080695e+06 | 0.337766 | 973742 | 1.011620 | 2.104873e+06 | 0.462613 |
| 28 | 28 | 17990455.0 | 19378102.0 | 19625409.0 | 21.903226 | 86.861290 | 14.890323 | 449787.0 | 1.045277e+06 | 0.430304 | 682416 | 1.012777 | 1.058633e+06 | 0.644620 |
| 29 | 29 | 3636663.0 | 9535483.0 | 9755299.0 | 22.575000 | 80.282000 | 15.650000 | 2121584.0 | 6.732079e+06 | 0.315145 | 3281778 | 1.011581 | 6.810040e+06 | 0.481903 |
| 30 | 30 | 566009.0 | 672591.0 | 701380.0 | 21.805660 | 85.988679 | 20.192453 | 596651.0 | 1.572714e+06 | 0.379377 | 766090 | 1.009286 | 1.587318e+06 | 0.482632 |
| 31 | 31 | 10847115.0 | 11536504.0 | 11546969.0 | 23.939773 | 85.865909 | 14.843182 | 4753783.0 | 1.513367e+07 | 0.314120 | 7116336 | 1.013849 | 1.534326e+07 | 0.463809 |
| 32 | 32 | 3145585.0 | 3751351.0 | 3815298.0 | 24.337662 | 82.732468 | 16.038961 | 2662549.0 | 7.382848e+06 | 0.360640 | 4384112 | 1.012554 | 7.475531e+06 | 0.586462 |
| 33 | 33 | 2842321.0 | 3831074.0 | 3893920.0 | 21.869444 | 87.586111 | 17.688889 | 236344.0 | 5.259281e+05 | 0.449385 | 316224 | 1.010829 | 5.316232e+05 | 0.594827 |
| 34 | 34 | 7969652.0 | 12702379.0 | 12768034.0 | 21.217910 | 86.647761 | 16.804478 | 3825274.0 | 8.774691e+06 | 0.435944 | 5142126 | 1.010940 | 8.870690e+06 | 0.579676 |
| 35 | 35 | 297188.0 | 1052567.0 | 1052761.0 | 20.580000 | 87.540000 | 15.580000 | 792980.0 | 2.838360e+06 | 0.279380 | 1325935 | 1.011563 | 2.871181e+06 | 0.461808 |
| 36 | 36 | 975853.0 | 4625364.0 | 4719009.0 | 23.213043 | 79.047826 | 14.595652 | 1429356.0 | 2.993239e+06 | 0.477528 | 1708168 | 1.012572 | 3.030870e+06 | 0.563590 |
| 37 | 37 | 589915.0 | 814180.0 | 832576.0 | 25.281818 | 86.231818 | 17.612121 | 3956401.0 | 1.000720e+07 | 0.395355 | 5556330 | 1.012378 | 1.013107e+07 | 0.548444 |
| 38 | 38 | 4877185.0 | 6346105.0 | 6450632.0 | 22.808421 | 77.427368 | 15.713684 | 335484.0 | 8.359487e+05 | 0.401321 | 427775 | 1.011239 | 8.453436e+05 | 0.506037 |
| 39 | 39 | 16986510.0 | 25145561.0 | 26078327.0 | 25.064173 | 77.331890 | 15.723622 | 1340189.0 | 3.551676e+06 | 0.377340 | 1802734 | 1.012327 | 3.595458e+06 | 0.501392 |
| 40 | 40 | 1722850.0 | 2763885.0 | 2854222.0 | 31.079310 | 89.734483 | 12.189655 | 319426.0 | 6.083405e+05 | 0.525078 | 361429 | 1.022879 | 6.222588e+05 | 0.580834 |
| 41 | 41 | 562758.0 | 625741.0 | 625606.0 | 20.785714 | 89.671429 | 15.400000 | 1559129.0 | 4.898659e+06 | 0.318277 | 2283727 | 1.012103 | 4.957948e+06 | 0.460619 |
| 42 | 42 | 3738146.0 | 7994802.0 | 8188656.0 | 21.342105 | 80.989474 | 15.621053 | 4745545.0 | 1.884303e+07 | 0.251846 | 7664208 | 1.011065 | 1.905153e+07 | 0.402288 |
| 43 | 43 | 4793602.0 | 6724540.0 | 6890899.0 | 22.794872 | 87.338462 | 16.187179 | 640495.0 | 1.904889e+06 | 0.336238 | 998897 | 1.012299 | 1.928317e+06 | 0.518015 |
| 44 | 44 | 1599249.0 | 1852994.0 | 1855360.0 | 20.760000 | 79.758182 | 16.810909 | 238521.0 | 4.956763e+05 | 0.481203 | 289931 | 1.009849 | 5.005584e+05 | 0.579215 |
| 45 | 45 | 4891769.0 | 5686986.0 | 5721075.0 | 22.684722 | 88.718056 | 16.384722 | 2189841.0 | 6.288543e+06 | 0.348227 | 3740455 | 1.012283 | 6.365785e+06 | 0.587587 |
| 46 | 46 | 453588.0 | 563626.0 | 576608.0 | 23.382609 | 91.147826 | 14.413043 | 2479409.0 | 5.191690e+06 | 0.477573 | 3006266 | 1.014787 | 5.268460e+06 | 0.570616 |
x10 = nfullh12.drop(columns=['voter_turnout_2010'])
x12 = nfullh12.drop(columns=['voter_turnout_2012'])
y10 = nfullh12['voter_turnout_2010']
y12 = nfullh12['voter_turnout_2012']
x10_train, x10_test, y10_train, y10_test = train_test_split(x10, y10, test_size=0.2, random_state=20)
x12_train, x12_test, y12_train, y12_test = train_test_split(x12, y12, test_size=0.2, random_state=20)
start = time.time()
dtr10 = DecisionTreeRegressor(random_state=40)
# define parameter grid
parameters_grid = {
'max_depth': [7, 10, 12],
'min_samples_split': [2, 4],
'min_samples_leaf': [2, 8]
}
# define grid search
grid_search = GridSearchCV(estimator=dtr10, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(x10_train, y10_train)
# get best estimator
best = grid_search.best_estimator_
print(best)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
dtr10preds = best.predict(x10_test)
mse10 = mean_squared_error(y10_test, dtr10preds)
run10 = time.time()-start
print("Decision Tree Regressor for 2010 cycle, MSE: ", mse10, "with run time of ",run10, ' seconds.')
DecisionTreeRegressor(max_depth=7, min_samples_leaf=8, random_state=40)
max_depth min_samples_leaf min_samples_split
Selected Value 7 8 2
Decision Tree Regressor for 2010 cycle, MSE: 0.001800649267498328 with run time of 1.0939931869506836 seconds.
plt.figure(figsize=(15, 15))
tr = tree.plot_tree(best, feature_names=x10_train.columns, filled=True)
importancedtr10 = best.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importancedtr10))], importancedtr10)
plt.show()
start = time.time()
dtr12 = DecisionTreeRegressor(random_state=40)
# define parameter grid
parameters_grid = {
'max_depth': [7,10, 12 ],
'min_samples_split': [2, 4],
'min_samples_leaf': [2, 8]
}
# define grid search
grid_search = GridSearchCV(estimator=dtr12, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(x12_train, y12_train)
# get best estimator
bestdtr12 = grid_search.best_estimator_
print(bestdtr12)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
dtr12preds = bestdtr12.predict(x12_test)
mse12 = mean_squared_error(y12_test, dtr12preds)
run12 = time.time()-start
print("Decision Tree Regressor for 2012 cycle, MSE: ", mse12, "with run time of ",run12, ' seconds.')
DecisionTreeRegressor(max_depth=7, min_samples_leaf=8, random_state=40)
max_depth min_samples_leaf min_samples_split
Selected Value 7 8 2
Decision Tree Regressor for 2012 cycle, MSE: 0.0011470368785370265 with run time of 1.1189322471618652 seconds.
plt.figure(figsize=(15, 15))
tr = tree.plot_tree(bestdtr12, feature_names=x12_train.columns, filled=True)
importancedtr12 = bestdtr12.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importancedtr12))], importancedtr12)
plt.show()
start = time.time()
rfr10 = RandomForestRegressor(random_state=40)
# define parameter grid
parameters_grid = {
'max_depth': [2, 3],
'min_samples_split': [2, 4],
'min_samples_leaf': [2, 8],
'n_estimators': [100, 150]
}
# define grid search
grid_search = GridSearchCV(estimator=rfr10, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(x10_train, y10_train)
# get best estimator
bestrfr10 = grid_search.best_estimator_
print(bestrfr10)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
rfr10preds = bestrfr10.predict(x10_test)
mse10 = mean_squared_error(y10_test, rfr10preds)
run10 = time.time()-start
print("Random Forest Regressor for 2010 cycle, MSE: ", mse10, "with run time of ",run10, ' seconds.')
RandomForestRegressor(max_depth=2, min_samples_leaf=8, random_state=40)
max_depth min_samples_leaf min_samples_split n_estimators
Selected Value 2 8 2 100
Random Forest Regressor for 2010 cycle, MSE: 0.0017958517212420182 with run time of 55.026798248291016 seconds.
importancerfr10 = bestrfr10.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importancerfr10))], importancerfr10)
plt.show()
start = time.time()
rfr12 = RandomForestRegressor(random_state=40)
# define parameter grid
parameters_grid = {
'max_depth': [2, 3],
'min_samples_split': [2, 4],
'min_samples_leaf': [2, 8],
'n_estimators': [100, 150]
}
# define grid search
grid_search = GridSearchCV(estimator=rfr12, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(x12_train, y12_train)
# get best estimator
bestrfr12 = grid_search.best_estimator_
print(bestrfr12)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
rfr12preds = bestrfr12.predict(x12_test)
mse12 = mean_squared_error(y12_test, rfr12preds)
run12 = time.time()-start
print("Random Forest Regressor for 2012 cycle, MSE: ", mse12, "with run time of ",run12, ' seconds.')
RandomForestRegressor(max_depth=2, min_samples_leaf=8, random_state=40)
max_depth min_samples_leaf min_samples_split n_estimators
Selected Value 2 8 2 100
Random Forest Regressor for 2012 cycle, MSE: 0.0012061475209400269 with run time of 59.686084270477295 seconds.
importancerfr12 = bestrfr12.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importancerfr12))], importancerfr12)
plt.show()
start = time.time()
adr10 = AdaBoostRegressor(random_state=40)
# define parameter grid
parameters_grid = {
'learning_rate': [0,0.5,1],
'n_estimators': [100, 150]
}
# define grid search
grid_search = GridSearchCV(estimator=adr10, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(x10_train, y10_train)
# get best estimator
bestadr10 = grid_search.best_estimator_
print(bestadr10)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
adr10preds = bestadr10.predict(x10_test)
mse10 = mean_squared_error(y10_test, adr10preds)
run10 = time.time()-start
print("Adaboost Regressor for 2010 cycle, MSE: ", mse10, "with run time of ",run10, ' seconds.')
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 1065, in fit
return super().fit(X, y, sample_weight)
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 114, in fit
raise ValueError("learning_rate must be greater than zero")
ValueError: learning_rate must be greater than zero
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite: [ nan nan -2.78110886 -3.01305966 -2.72880065 -2.64417937]
category=UserWarning,
AdaBoostRegressor(learning_rate=1, n_estimators=150, random_state=40)
learning_rate n_estimators
Selected Value 1 150
Adaboost Regressor for 2010 cycle, MSE: 0.0013454571718047037 with run time of 14.200978755950928 seconds.
importanceadr10 = bestadr10.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importanceadr10))], importanceadr10)
plt.show()
start = time.time()
adr12 = AdaBoostRegressor(random_state=40)
# define parameter grid
parameters_grid = {
'learning_rate': [0,0.5,1],
'n_estimators': [100, 150]
}
# define grid search
grid_search = GridSearchCV(estimator=adr12, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(x12_train, y12_train)
# get best estimator
bestadr12 = grid_search.best_estimator_
print(bestadr12)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
adr12preds = bestadr12.predict(x12_test)
mse12 = mean_squared_error(y12_test, adr12preds)
run12 = time.time()-start
print("Adaboost Regressor for 2012 cycle, MSE: ", mse12, "with run time of ",run12, ' seconds.')
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 1065, in fit
return super().fit(X, y, sample_weight)
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 114, in fit
raise ValueError("learning_rate must be greater than zero")
ValueError: learning_rate must be greater than zero
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite: [ nan nan -0.08162522 -0.07681493 -0.11986831 -0.13357404]
category=UserWarning,
AdaBoostRegressor(learning_rate=0.5, n_estimators=150, random_state=40)
learning_rate n_estimators
Selected Value 0.5 150.0
Random Forest Regressor for 2012 cycle, MSE: 0.0014433330194855497 with run time of 15.569398880004883 seconds.
importanceadr12 = bestadr12.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importanceadr12))], importanceadr12)
plt.show()
start = time.time()
gbr10 = GradientBoostingRegressor(random_state=40)
# define parameter grid
parameters_grid = {
'learning_rate': [0,0.5,1],
'n_estimators': [100, 150]
}
# define grid search
grid_search = GridSearchCV(estimator=gbr10, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(x10_train, y10_train)
# get best estimator
bestgbr10 = grid_search.best_estimator_
print(bestgbr10)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
gbr10preds = bestgbr10.predict(x10_test)
mse10 = mean_squared_error(y10_test, gbr10preds)
run10 = time.time()-start
print("GradientBoost Regressor for 2010 cycle, MSE: ", mse10, "with run time of ",run10, ' seconds.')
GradientBoostingRegressor(learning_rate=0.5, random_state=40)
learning_rate n_estimators
Selected Value 0.5 100.0
GradientBoost Regressor for 2010 cycle, MSE: 0.0019462548968943766 with run time of 3.213634490966797 seconds.
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 525, in fit
self._check_params()
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 275, in _check_params
"learning_rate must be greater than 0 but was %r" % self.learning_rate
ValueError: learning_rate must be greater than 0 but was 0
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite: [ nan nan -5.7963179 -5.7963179 -12.80302701
-12.80302701]
category=UserWarning,
importancegbr10 = bestgbr10.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importancegbr10))], importancegbr10)
plt.show()
start = time.time()
gbr12 = GradientBoostingRegressor(random_state=40)
# define parameter grid
parameters_grid = {
'learning_rate': [0,0.5,1],
'n_estimators': [100, 150]
}
# define grid search
grid_search = GridSearchCV(estimator=gbr12, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(x12_train, y12_train)
# get best estimator
bestgbr12 = grid_search.best_estimator_
print(bestgbr12)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
gbr12preds = bestgbr12.predict(x12_test)
mse12 = mean_squared_error(y12_test, gbr12preds)
run12 = time.time()-start
print("GradientBoost Regressor for 2012 cycle, MSE: ", mse12, "with run time of ",run12, ' seconds.')
GradientBoostingRegressor(learning_rate=0.5, random_state=40)
learning_rate n_estimators
Selected Value 0.5 100.0
GradientBoost Regressor for 2012 cycle, MSE: 0.002364375245120338 with run time of 2.9606430530548096 seconds.
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 525, in fit
self._check_params()
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 275, in _check_params
"learning_rate must be greater than 0 but was %r" % self.learning_rate
ValueError: learning_rate must be greater than 0 but was 0
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite: [ nan nan -0.44096898 -0.44096898 -0.76539187 -0.76539187]
category=UserWarning,
importancegbr12 = bestgbr12.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importancegbr12))], importancegbr12)
plt.show()
house
| year | state | state_po | state_fips | state_cen | state_ic | office | district | stage | runoff | special | candidate | party | writein | mode | candidatevotes | totalvotes | unofficial | version | fusion_ticket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1976 | alabama | al | 1 | 63 | 41 | us house | 1 | gen | False | False | bill davenport | democrat | False | total | 58906 | 157170 | False | 20220331 | False |
| 1 | 1976 | alabama | al | 1 | 63 | 41 | us house | 1 | gen | False | False | jack edwards | republican | False | total | 98257 | 157170 | False | 20220331 | False |
| 2 | 1976 | alabama | al | 1 | 63 | 41 | us house | 1 | gen | False | False | writein | unknown | True | total | 7 | 157170 | False | 20220331 | False |
| 3 | 1976 | alabama | al | 1 | 63 | 41 | us house | 2 | gen | False | False | j carole keahey | democrat | False | total | 66288 | 156362 | False | 20220331 | False |
| 4 | 1976 | alabama | al | 1 | 63 | 41 | us house | 2 | gen | False | False | william l "bill" dickinson | republican | False | total | 90069 | 156362 | False | 20220331 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 31098 | 2020 | wyoming | wy | 56 | 83 | 68 | us house | 0 | gen | False | False | lynnette grey bull | democrat | False | total | 66576 | 278503 | False | 20220331 | False |
| 31099 | 2020 | wyoming | wy | 56 | 83 | 68 | us house | 0 | gen | False | False | overvotes | unknown | False | total | 1274 | 278503 | False | 20220331 | False |
| 31100 | 2020 | wyoming | wy | 56 | 83 | 68 | us house | 0 | gen | False | False | richard brubaker | libertarian | False | total | 10154 | 278503 | False | 20220331 | False |
| 31101 | 2020 | wyoming | wy | 56 | 83 | 68 | us house | 0 | gen | False | False | undervotes | unknown | False | total | 6337 | 278503 | False | 20220331 | False |
| 31102 | 2020 | wyoming | wy | 56 | 83 | 68 | us house | 0 | gen | False | False | writein | unknown | True | total | 525 | 278503 | False | 20220331 | False |
31103 rows × 20 columns
clsh = house.groupby(['state', 'year','party'], as_index=False).agg({'candidatevotes': 'sum'})
clsh = clsh[~clsh['state'].isin(['district of columbia'])]
clsh.reset_index(drop=True, inplace=True)
#clsh = clsh.sort_values(['candidatevotes'], ascending=False).groupby(['state', 'year'], as_index=False)
clsh.head(19)
| state | year | party | candidatevotes | |
|---|---|---|---|---|
| 0 | alabama | 1976 | democrat | 667052 |
| 1 | alabama | 1976 | national democrat | 1021 |
| 2 | alabama | 1976 | prohibition | 1111 |
| 3 | alabama | 1976 | republican | 314970 |
| 4 | alabama | 1976 | unknown | 27 |
| 5 | alabama | 1978 | conservative | 3285 |
| 6 | alabama | 1978 | democrat | 439564 |
| 7 | alabama | 1978 | libertarian | 2250 |
| 8 | alabama | 1978 | republican | 197176 |
| 9 | alabama | 1978 | unknown | 4 |
| 10 | alabama | 1980 | democrat | 628133 |
| 11 | alabama | 1980 | libertarian | 22924 |
| 12 | alabama | 1980 | national democratic party of alabama | 1743 |
| 13 | alabama | 1980 | republican | 354224 |
| 14 | alabama | 1980 | statesman | 4650 |
| 15 | alabama | 1980 | unknown | 1952 |
| 16 | alabama | 1982 | democrat | 676584 |
| 17 | alabama | 1982 | libertarian | 11896 |
| 18 | alabama | 1982 | republican | 272510 |
print(type(clsh))
<class 'pandas.core.frame.DataFrame'>
fclsh = clsh.copy()
#fclsh = fclsh.sort_values(['candidatevotes'], ascending=False)
fclsh['label'] = fclsh.groupby(['state', 'year'], as_index=False)['candidatevotes'].transform('sum')
fclsh
| state | year | party | candidatevotes | label | |
|---|---|---|---|---|---|
| 0 | alabama | 1976 | democrat | 667052 | 984181 |
| 1 | alabama | 1976 | national democrat | 1021 | 984181 |
| 2 | alabama | 1976 | prohibition | 1111 | 984181 |
| 3 | alabama | 1976 | republican | 314970 | 984181 |
| 4 | alabama | 1976 | unknown | 27 | 984181 |
| ... | ... | ... | ... | ... | ... |
| 5329 | wyoming | 2020 | constitution | 7905 | 278503 |
| 5330 | wyoming | 2020 | democrat | 66576 | 278503 |
| 5331 | wyoming | 2020 | libertarian | 10154 | 278503 |
| 5332 | wyoming | 2020 | republican | 185732 | 278503 |
| 5333 | wyoming | 2020 | unknown | 8136 | 278503 |
5334 rows × 5 columns
nfc = pd.DataFrame(columns=fclsh.columns)
for st in fclsh.state.unique():
for yr in fclsh.year.unique():
temp = fclsh.groupby(['state', 'year']).get_group((st, yr))
temp['label'] = temp.loc[(temp['candidatevotes'] == temp['candidatevotes'].max())]['party']
nfc = pd.concat([nfc, temp], axis = 0)
nfc
C:\Users\lover\anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy """
| state | year | party | candidatevotes | label | |
|---|---|---|---|---|---|
| 0 | alabama | 1976 | democrat | 667052 | democrat |
| 1 | alabama | 1976 | national democrat | 1021 | NaN |
| 2 | alabama | 1976 | prohibition | 1111 | NaN |
| 3 | alabama | 1976 | republican | 314970 | NaN |
| 4 | alabama | 1976 | unknown | 27 | NaN |
| ... | ... | ... | ... | ... | ... |
| 5329 | wyoming | 2020 | constitution | 7905 | NaN |
| 5330 | wyoming | 2020 | democrat | 66576 | NaN |
| 5331 | wyoming | 2020 | libertarian | 10154 | NaN |
| 5332 | wyoming | 2020 | republican | 185732 | republican |
| 5333 | wyoming | 2020 | unknown | 8136 | NaN |
5334 rows × 5 columns
nfc.dropna(inplace=True)
nfc.reset_index(drop=True, inplace=True)
nfc
| state | year | party | candidatevotes | label | |
|---|---|---|---|---|---|
| 0 | alabama | 1976 | democrat | 667052 | democrat |
| 1 | alabama | 1978 | democrat | 439564 | democrat |
| 2 | alabama | 1980 | democrat | 628133 | democrat |
| 3 | alabama | 1982 | democrat | 676584 | democrat |
| 4 | alabama | 1984 | democrat | 821773 | democrat |
| ... | ... | ... | ... | ... | ... |
| 1145 | wyoming | 2012 | republican | 166452 | republican |
| 1146 | wyoming | 2014 | republican | 113038 | republican |
| 1147 | wyoming | 2016 | republican | 156176 | republican |
| 1148 | wyoming | 2018 | republican | 127963 | republican |
| 1149 | wyoming | 2020 | republican | 185732 | republican |
1150 rows × 5 columns
We summed for each state in each election cycle how many votes for each party and added a new column telling the state in that year which label does it have, meaning which party had more votes.
1- we have to change the labels from democrat, republican to D,R accordingly.
2- for SVM we need to copy the dataframe, factorize all columns except for label and scale then apply svm on the new dataframe.
3- apply another classification algorithms : Decision Tree classifier, Random Forest Classifier, Adaboost Classifier, Gradientboost Classifier.
nfc['label'].replace({'democrat': 'D', 'republican': 'R', 'democratic-farmer-labor': 'D', 'independent': 'R'},inplace=True)
nfc
| state | year | party | candidatevotes | label | |
|---|---|---|---|---|---|
| 0 | alabama | 1976 | democrat | 667052 | D |
| 1 | alabama | 1978 | democrat | 439564 | D |
| 2 | alabama | 1980 | democrat | 628133 | D |
| 3 | alabama | 1982 | democrat | 676584 | D |
| 4 | alabama | 1984 | democrat | 821773 | D |
| ... | ... | ... | ... | ... | ... |
| 1145 | wyoming | 2012 | republican | 166452 | R |
| 1146 | wyoming | 2014 | republican | 113038 | R |
| 1147 | wyoming | 2016 | republican | 156176 | R |
| 1148 | wyoming | 2018 | republican | 127963 | R |
| 1149 | wyoming | 2020 | republican | 185732 | R |
1150 rows × 5 columns
nfc['label'].unique()
array(['D', 'R', 'I'], dtype=object)
from sklearn.svm import SVC
svmnfc = nfc.copy()
svmX = svmnfc.drop(columns=['label'])
svmY = svmnfc['label']
for col in svmX:
if svmX[col].dtype == 'object':
svmX[col] = pd.factorize(svmX[col])[0]
svmX = pd.DataFrame(StandardScaler().fit_transform(svmX), columns=svmX.columns)
svmX
| state | year | party | candidatevotes | |
|---|---|---|---|---|
| 0 | -1.697749 | -1.658312 | -0.946657 | -1.730603 |
| 1 | -1.697749 | -1.507557 | -0.946657 | -1.727587 |
| 2 | -1.697749 | -1.356801 | -0.946657 | -1.724572 |
| 3 | -1.697749 | -1.206045 | -0.946657 | -1.721556 |
| 4 | -1.697749 | -1.055290 | -0.946657 | -1.718540 |
| ... | ... | ... | ... | ... |
| 1145 | 1.697749 | 1.055290 | 0.858742 | 1.719484 |
| 1146 | 1.697749 | 1.206045 | 0.858742 | 1.722500 |
| 1147 | 1.697749 | 1.356801 | 0.858742 | 1.725516 |
| 1148 | 1.697749 | 1.507557 | 0.858742 | 1.728532 |
| 1149 | 1.697749 | 1.658312 | 0.858742 | 1.731547 |
1150 rows × 4 columns
svmnfc = pd.concat([svmX, svmY], axis = 1)
svmnfc
| state | year | party | candidatevotes | label | |
|---|---|---|---|---|---|
| 0 | -1.697749 | -1.658312 | -0.946657 | -1.730603 | D |
| 1 | -1.697749 | -1.507557 | -0.946657 | -1.727587 | D |
| 2 | -1.697749 | -1.356801 | -0.946657 | -1.724572 | D |
| 3 | -1.697749 | -1.206045 | -0.946657 | -1.721556 | D |
| 4 | -1.697749 | -1.055290 | -0.946657 | -1.718540 | D |
| ... | ... | ... | ... | ... | ... |
| 1145 | 1.697749 | 1.055290 | 0.858742 | 1.719484 | R |
| 1146 | 1.697749 | 1.206045 | 0.858742 | 1.722500 | R |
| 1147 | 1.697749 | 1.356801 | 0.858742 | 1.725516 | R |
| 1148 | 1.697749 | 1.507557 | 0.858742 | 1.728532 | R |
| 1149 | 1.697749 | 1.658312 | 0.858742 | 1.731547 | R |
1150 rows × 5 columns
Splitting the data into train and test
svX_train, svX_test, svY_train, svY_test = train_test_split(svmX, svmY, random_state=40, test_size=0.2)
start = time.time()
svm = SVC(random_state=40)
# define parameter grid
parameters_grid = {
'gamma': ['auto', 1, 2],
'shrinking': [False, True],
'probability': [False, True],
'tol': [0.0005, 0.00075, 0.001]
}
# define grid search
grid_search = GridSearchCV(estimator=svm, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(svX_train, svY_train)
# get best estimator
bestsvm = grid_search.best_estimator_
print(bestsvm)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
svmpreds = bestsvm.predict(svX_test)
sacc = accuracy_score(svY_test, svmpreds)
runs = time.time()-start
print("SVM Classifier, Accuracy: ", sacc, "with run time of ",runs, ' seconds.')
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:680: UserWarning: The least populated class in y has only 6 members, which is less than n_splits=10. UserWarning,
SVC(gamma='auto', random_state=40, shrinking=False, tol=0.0005)
gamma probability shrinking tol
Selected Value auto False False 0.0005
SVM Classifier, Accuracy: 1.0 with run time of 35.58691954612732 seconds.
importancesvm = bestsvm.feature_names_in_
# plot feature importance
plt.bar([x for x in range(len(importancesvm))], importancesvm)
plt.show()
start = time.time()
dtc = DecisionTreeClassifier(random_state=40)
# define parameter grid
parameters_grid = {
'max_depth': [7, 10, 12],
'min_samples_split': [2, 4],
'min_samples_leaf': [2, 8]
}
# define grid search
grid_search = GridSearchCV(estimator=dtc, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(svX_train, svY_train)
# get best estimator
bestdtc = grid_search.best_estimator_
print(bestdtc)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
dtcpreds = bestdtc.predict(svX_test)
cacc = accuracy_score(svY_test, dtcpreds)
runc = time.time()-start
print("Decision Tree Classifier, Accuracy: ", cacc, "with run time of ",runc, ' seconds.')
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:680: UserWarning: The least populated class in y has only 6 members, which is less than n_splits=10. UserWarning,
DecisionTreeClassifier(max_depth=7, min_samples_leaf=2, random_state=40)
max_depth min_samples_leaf min_samples_split
Selected Value 7 2 2
Decision Tree Classifier, Accuracy: 1.0 with run time of 1.2300403118133545 seconds.
plt.figure(figsize=(15, 15))
tr = tree.plot_tree(bestdtc, feature_names=svmX.columns, class_names=['D', 'R', 'I'], filled=True)
importancedtc = bestdtc.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importancedtc))], importancedtc)
plt.show()
start = time.time()
rfc = RandomForestClassifier(random_state=40)
# define parameter grid
parameters_grid = {
'max_depth': [2, 3],
'min_samples_split': [2, 4],
'min_samples_leaf': [2, 8],
'n_estimators': [100, 150]
}
# define grid search
grid_search = GridSearchCV(estimator=rfc, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(svX_train, svY_train)
# get best estimator
bestrfc = grid_search.best_estimator_
print(bestrfc)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
rfcpreds = bestrfc.predict(svX_test)
accrfc = accuracy_score(svY_test, rfcpreds)
runrfc = time.time()-start
print("Random Forest Classifier , Accuracy: ", accrfc, "with run time of ",runrfc, ' seconds.')
importancerfc = bestrfc.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importancerfc))], importancerfc)
plt.show()
start = time.time()
adc = AdaBoostClassifier(random_state=40)
# define parameter grid
parameters_grid = {
'learning_rate': [0,0.5,1],
'n_estimators': [100, 150]
}
# define grid search
grid_search = GridSearchCV(estimator=adc, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(svX_train, svY_train)
# get best estimator
bestadc = grid_search.best_estimator_
print(bestadc)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
adcpreds = bestadc.predict(svX_test)
accadc = accuracy_score(svY_test, adcpreds)
runadc = time.time()-start
print("Adaboost Classifier , Accuracy: ", accadc, "with run time of ",runadc, ' seconds.')
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:680: UserWarning: The least populated class in y has only 6 members, which is less than n_splits=10.
UserWarning,
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 486, in fit
return super().fit(X, y, sample_weight)
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 114, in fit
raise ValueError("learning_rate must be greater than zero")
ValueError: learning_rate must be greater than zero
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite: [nan nan 1. 1. 1. 1.]
category=UserWarning,
AdaBoostClassifier(learning_rate=0.5, n_estimators=100, random_state=40)
learning_rate n_estimators
Selected Value 0.5 100.0
Adaboost Classifier , Accuracy: 1.0 with run time of 26.840372562408447 seconds.
importanceadc = bestadc.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importanceadc))], importanceadc)
plt.show()
start = time.time()
gbc = GradientBoostingClassifier(random_state=40)
# define parameter grid
parameters_grid = {
'learning_rate': [0,0.5,1],
'n_estimators': [100, 150]
}
# define grid search
grid_search = GridSearchCV(estimator=gbc, param_grid=parameters_grid, cv=10)
# fit estimator
grid_search.fit(svX_train, svY_train)
# get best estimator
bestgbc = grid_search.best_estimator_
print(bestgbc)
# print best parameters
print(pd.DataFrame.from_dict(grid_search.best_params_, orient='index', columns=['Selected Value']).T)
gbcpreds = bestgbc.predict(svX_test)
accgbc = accuracy_score(svY_test, gbcpreds)
rungbc = time.time()-start
print("Gradientboost Classifier , Accuracy: ", accgbc, "with run time of ",rungbc, ' seconds.')
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:680: UserWarning: The least populated class in y has only 6 members, which is less than n_splits=10.
UserWarning,
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 525, in fit
self._check_params()
File "C:\Users\lover\anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 275, in _check_params
"learning_rate must be greater than 0 but was %r" % self.learning_rate
ValueError: learning_rate must be greater than 0 but was 0
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\lover\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite: [nan nan 1. 1. 1. 1.]
category=UserWarning,
GradientBoostingClassifier(learning_rate=0.5, random_state=40)
learning_rate n_estimators
Selected Value 0.5 100.0
Gradientboost Classifier , Accuracy: 1.0 with run time of 22.804075002670288 seconds.
importancegbc = bestadc.feature_importances_
# plot feature importance
plt.bar([x for x in range(len(importancegbc))], importancegbc)
plt.show()
nfc10 = nfc.groupby(['year']).get_group((2010))
fig = plt.figure(figsize = (12,12))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('State', fontsize = 15)
ax.set_ylabel('Label', fontsize = 15)
ax.set_title('State Vs. Label in 2010', fontsize = 20, color='cornflowerblue')
targets = ['california', 'florida', 'south dakota', 'wyoming']
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
indicesToKeep = nfc10['state'] == target
ax.scatter(nfc10.loc[indicesToKeep, 'state']
, nfc10.loc[indicesToKeep, 'label']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
nfc12 = nfc.groupby(['year']).get_group((2012))
fig = plt.figure(figsize = (12,12))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('State', fontsize = 15)
ax.set_ylabel('Label', fontsize = 15)
ax.set_title('State Vs. Label in 2012', fontsize = 20, color='cornflowerblue')
targets = ['california', 'florida', 'south dakota', 'wyoming']
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
indicesToKeep = nfc12['state'] == target
ax.scatter(nfc12.loc[indicesToKeep, 'state']
, nfc12.loc[indicesToKeep, 'label']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()